mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Unified separator and ASN barcode parsing
so that barcode parsing won't run twice
This commit is contained in:
parent
92b9fc1ba9
commit
5004771d79
@ -194,25 +194,51 @@ def scan_file_for_barcodes(
|
||||
return pdf_filepath, barcodes
|
||||
|
||||
|
||||
def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], List[int]]:
|
||||
def get_separating_barcodes(barcodes: List[Tuple[int, str]]) -> List[int]:
|
||||
"""
|
||||
Scan the provided pdf file for page separating barcodes
|
||||
Returns a PDF filepath and a list of pagenumbers,
|
||||
which separate the file into new files
|
||||
Search the parsed barcodes for separators
|
||||
and returns a list of pagenumbers, which
|
||||
separate the file into new files
|
||||
"""
|
||||
separator_page_numbers = []
|
||||
|
||||
pdf_filepath, barcodes = scan_file_for_barcodes(filepath)
|
||||
|
||||
# filter all barcodes for the separator string
|
||||
separator_barcodes = list(
|
||||
filter(lambda bc: bc[1] == settings.CONSUMER_BARCODE_STRING, barcodes),
|
||||
)
|
||||
|
||||
# get the page numbers of the separating barcodes
|
||||
separator_page_numbers = [page for page, _ in separator_barcodes]
|
||||
|
||||
return pdf_filepath, separator_page_numbers
|
||||
return separator_page_numbers
|
||||
|
||||
|
||||
def get_asn_from_barcodes(barcodes: List[Tuple[int, str]]) -> Optional[int]:
    """
    Search the parsed barcodes for any ASNs.

    The first barcode whose text starts with
    settings.CONSUMER_ASN_BARCODE_PREFIX is considered the ASN to be used.

    barcodes is a list of (page number, barcode text) tuples; only the
    text is inspected here.

    Returns the detected ASN as an int, or None if no barcode carries the
    prefix or the text after the prefix cannot be parsed as an integer.
    """
    prefix = settings.CONSUMER_ASN_BARCODE_PREFIX

    # only the barcode text is important here -> discard the page number;
    # the None default keeps next() from raising when nothing matches
    asn_text = next(
        (text for _, text in barcodes if text.startswith(prefix)),
        None,
    )

    if not asn_text:
        return None

    logger.debug(f"Found ASN Barcode: {asn_text}")

    # remove the prefix and surrounding whitespace
    asn_text = asn_text[len(prefix) :].strip()

    # now, try parsing the ASN number
    try:
        return int(asn_text)
    except ValueError as e:
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning(f"Failed to parse ASN number because: {e}")
        return None
|
||||
|
||||
|
||||
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
|
||||
@ -293,36 +319,3 @@ def save_to_dir(
|
||||
os.rename(dst, dst_new)
|
||||
else:
|
||||
logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")
|
||||
|
||||
|
||||
def scan_file_for_asn_barcode(filepath: str) -> Tuple[Optional[str], Optional[int]]:
    """
    Scan the provided pdf file for barcodes that contain the ASN
    for this document.

    The first barcode whose text starts with
    settings.CONSUMER_ASN_BARCODE_PREFIX is considered the ASN to be used.

    Returns the PDF filepath reported by scan_file_for_barcodes and the
    detected ASN (or None if no ASN barcode is present or its number
    cannot be parsed as an integer).
    """
    asn = None
    prefix = settings.CONSUMER_ASN_BARCODE_PREFIX

    pdf_filepath, barcodes = scan_file_for_barcodes(filepath)

    # only the barcode text is important here -> discard the page number.
    # The explicit None default is required: without it next() raises
    # StopIteration for every document that has no ASN barcode.
    asn_text = next(
        (text for _, text in barcodes if text.startswith(prefix)),
        None,
    )

    if asn_text:
        # log only once a matching barcode has actually been found
        logger.debug(f"Found ASN Barcode: {asn_text}")

        # remove the prefix and surrounding whitespace
        asn_text = asn_text[len(prefix) :].strip()

        # now, try parsing the ASN number
        try:
            asn = int(asn_text)
        except ValueError as e:
            # Logger.warn is a deprecated alias; use Logger.warning
            logger.warning(f"Failed to parse ASN number because: {e}")

    return pdf_filepath, asn
|
||||
|
@ -98,6 +98,7 @@ def consume_file(
|
||||
):
|
||||
|
||||
path = Path(path).resolve()
|
||||
asn = None
|
||||
|
||||
# Celery converts this to a string, but everything expects a datetime
|
||||
# Long term solution is to not use JSON for the serializer but pickle instead
|
||||
@ -109,12 +110,15 @@ def consume_file(
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# check for separators in current document
|
||||
# read all barcodes in the current document
|
||||
if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
|
||||
pdf_filepath, parsed_barcodes = barcodes.scan_file_for_barcodes(path)
|
||||
|
||||
# split document by separator pages, if enabled
|
||||
if settings.CONSUMER_ENABLE_BARCODES:
|
||||
separators = barcodes.get_separating_barcodes(parsed_barcodes)
|
||||
|
||||
pdf_filepath, separators = barcodes.scan_file_for_separating_barcodes(path)
|
||||
|
||||
if separators:
|
||||
if len(separators) > 0:
|
||||
logger.debug(
|
||||
f"Pages with separators found in: {str(path)}",
|
||||
)
|
||||
@ -175,12 +179,11 @@ def consume_file(
|
||||
# the barcodes has been split and will be consumed separately
|
||||
return "File successfully split"
|
||||
|
||||
# try reading ASN barcodes
|
||||
asn = None
|
||||
# try reading the ASN from barcode
|
||||
if settings.CONSUMER_ENABLE_ASN_BARCODE:
|
||||
_, asn = barcodes.scan_file_for_asn_barcode(path)
|
||||
asn = barcodes.get_asn_from_barcodes(parsed_barcodes)
|
||||
if asn:
|
||||
logger.info(f"Using ASN {asn} from barcode")
|
||||
logger.info(f"Found ASN in barcode: {asn}")
|
||||
|
||||
# continue with consumption if no barcode was found
|
||||
document = Consumer().try_consume_file(
|
||||
@ -192,7 +195,7 @@ def consume_file(
|
||||
override_tag_ids=override_tag_ids,
|
||||
task_id=task_id,
|
||||
override_created=override_created,
|
||||
override_asn=asn
|
||||
override_asn=asn,
|
||||
)
|
||||
|
||||
if document:
|
||||
|
Loading…
x
Reference in New Issue
Block a user