Updates handling of barcodes to encapsulate logic, moving it out of tasks and into barcodes

This commit is contained in:
Trenton H 2023-05-19 09:59:57 -07:00
parent 58f95c1891
commit 07e07fc7e8
21 changed files with 589 additions and 827 deletions

View File

@ -1,12 +1,11 @@
import logging import logging
import os
import shutil import shutil
import tempfile import tempfile
from dataclasses import dataclass from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path from pathlib import Path
from subprocess import run from subprocess import run
from typing import Dict from typing import Dict
from typing import Final
from typing import List from typing import List
from typing import Optional from typing import Optional
@ -18,13 +17,11 @@ from pikepdf import Page
from pikepdf import Pdf from pikepdf import Pdf
from PIL import Image from PIL import Image
from documents.data_models import DocumentSource
logger = logging.getLogger("paperless.barcodes") logger = logging.getLogger("paperless.barcodes")
class BarcodeImageFormatError(Exception):
pass
@dataclass(frozen=True) @dataclass(frozen=True)
class Barcode: class Barcode:
""" """
@ -51,56 +48,72 @@ class Barcode:
return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX) return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)
@dataclass class BarcodeReader:
class DocumentBarcodeInfo: def __init__(self, filepath: Path, mime_type: str) -> None:
""" self.file: Final[Path] = filepath
Describes a single document's barcode status self.mime: Final[str] = mime_type
""" self.pdf_file: Path = self.file
self.barcodes: List[Barcode] = []
self.temp_dir: Optional[Path] = None
pdf_path: Path if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
barcodes: List[Barcode] self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"}
else:
self.SUPPORTED_FILE_MIMES = {"application/pdf"}
def __enter__(self):
if self.supported_mime_type:
self.temp_dir = tempfile.TemporaryDirectory(prefix="paperless-barcodes")
return self
@lru_cache(maxsize=8) def __exit__(self, exc_type, exc_val, exc_tb):
def supported_file_type(mime_type: str) -> bool: if self.temp_dir is not None:
""" self.temp_dir.cleanup()
Determines if the file is valid for barcode self.temp_dir = None
processing, based on MIME type and settings
:return: True if the file is supported, False otherwise @property
""" def supported_mime_type(self) -> bool:
supported_mime = ["application/pdf"] """
if settings.CONSUMER_BARCODE_TIFF_SUPPORT: Return True if the given mime type is supported for barcodes, false otherwise
supported_mime += ["image/tiff"] """
return self.mime in self.SUPPORTED_FILE_MIMES
return mime_type in supported_mime @property
def asn(self) -> Optional[int]:
"""
Search the parsed barcodes for any ASNs.
The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
is considered the ASN to be used.
Returns the detected ASN (or None)
"""
asn = None
# Ensure the barcodes have been read
self.detect()
def barcode_reader(image: Image) -> List[str]: # get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
""" asn_text = next(
Read any barcodes contained in image (x.value for x in self.barcodes if x.is_asn),
Returns a list containing all found barcodes None,
""" )
barcodes = []
if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR": if asn_text:
logger.debug("Scanning for barcodes using PYZBAR") logger.debug(f"Found ASN Barcode: {asn_text}")
from pyzbar import pyzbar # remove the prefix and remove whitespace
asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
# Decode the barcode image # now, try parsing the ASN number
detected_barcodes = pyzbar.decode(image) try:
asn = int(asn_text)
except ValueError as e:
logger.warning(f"Failed to parse ASN number because: {e}")
return asn
@staticmethod
def read_barcodes_zxing(image: Image) -> List[str]:
barcodes = []
if detected_barcodes:
# Traverse through all the detected barcodes in image
for barcode in detected_barcodes:
if barcode.data:
decoded_barcode = barcode.data.decode("utf-8")
barcodes.append(decoded_barcode)
logger.debug(
f"Barcode of type {str(barcode.type)} found: {decoded_barcode}",
)
elif settings.CONSUMER_BARCODE_SCANNER == "ZXING":
logger.debug("Scanning for barcodes using ZXING")
import zxingcpp import zxingcpp
detected_barcodes = zxingcpp.read_barcodes(image) detected_barcodes = zxingcpp.read_barcodes(image)
@ -111,74 +124,92 @@ def barcode_reader(image: Image) -> List[str]:
f"Barcode of type {str(barcode.format)} found: {barcode.text}", f"Barcode of type {str(barcode.format)} found: {barcode.text}",
) )
return barcodes return barcodes
@staticmethod
def read_barcodes_pyzbar(image: Image) -> List[str]:
barcodes = []
def convert_from_tiff_to_pdf(filepath: Path) -> Path: from pyzbar import pyzbar
"""
converts a given TIFF image file to pdf into a temporary directory.
Returns the new pdf file. # Decode the barcode image
""" detected_barcodes = pyzbar.decode(image)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
# use old file name with pdf extension
newpath = Path(tempdir) / Path(filepath.name).with_suffix(".pdf")
with Image.open(filepath) as im: # Traverse through all the detected barcodes in image
has_alpha_layer = im.mode in ("RGBA", "LA") for barcode in detected_barcodes:
if has_alpha_layer: if barcode.data:
run( decoded_barcode = barcode.data.decode("utf-8")
[ barcodes.append(decoded_barcode)
settings.CONVERT_BINARY, logger.debug(
"-alpha", f"Barcode of type {str(barcode.type)} found: {decoded_barcode}",
"off", )
filepath,
filepath,
],
)
with filepath.open("rb") as img_file, newpath.open("wb") as pdf_file:
pdf_file.write(img2pdf.convert(img_file))
return newpath
return barcodes
def scan_file_for_barcodes( def convert_from_tiff_to_pdf(self):
filepath: Path, """
mime_type: str, May convert a TIFF image into a PDF, if the input is a TIFF
) -> DocumentBarcodeInfo: """
""" # Nothing to do, pdf_file is already assigned correctly
Scan the provided pdf file for any barcodes if self.mime != "image/tiff":
Returns a PDF filepath and a list of return
(page_number, barcode_text) tuples
"""
def _pdf2image_barcode_scan(pdf_filepath: str) -> List[Barcode]: with Image.open(self.file) as im:
detected_barcodes = [] has_alpha_layer = im.mode in ("RGBA", "LA")
# use a temporary directory in case the file is too big to handle in memory if has_alpha_layer:
with tempfile.TemporaryDirectory() as path: # Note the save into the temp folder, so as not to trigger a new
pages_from_path = convert_from_path( # consume
pdf_filepath, scratch_image = Path(self.temp_dir.name) / Path(self.file.name)
dpi=300, run(
output_folder=path, [
settings.CONVERT_BINARY,
"-alpha",
"off",
self.file,
scratch_image,
],
) )
else:
# Not modifying the original, safe to use in place
scratch_image = self.file
self.pdf_file = Path(self.temp_dir.name) / Path(self.file.name).with_suffix(
".pdf",
)
with scratch_image.open("rb") as img_file, self.pdf_file.open("wb") as pdf_file:
pdf_file.write(img2pdf.convert(img_file))
def detect(self) -> None:
"""
Scan all pages of the PDF as images, updating barcodes and the pages
found on as we go
"""
# Bail if barcodes already exist
if self.barcodes:
return
# Choose the library for reading
if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR":
reader = self.read_barcodes_pyzbar
logger.debug("Scanning for barcodes using PYZBAR")
else:
reader = self.read_barcodes_zxing
logger.debug("Scanning for barcodes using ZXING")
try:
pages_from_path = convert_from_path(
self.pdf_file,
dpi=300,
output_folder=self.temp_dir.name,
)
for current_page_number, page in enumerate(pages_from_path): for current_page_number, page in enumerate(pages_from_path):
for barcode_value in barcode_reader(page): for barcode_value in reader(page):
detected_barcodes.append( self.barcodes.append(
Barcode(current_page_number, barcode_value), Barcode(current_page_number, barcode_value),
) )
return detected_barcodes
pdf_filepath = None
barcodes = []
if supported_file_type(mime_type):
pdf_filepath = filepath
if mime_type == "image/tiff":
pdf_filepath = convert_from_tiff_to_pdf(filepath)
# Always try pikepdf first, it's usually fine, faster and
# uses less memory
try:
barcodes = _pdf2image_barcode_scan(pdf_filepath)
# Password protected files can't be checked # Password protected files can't be checked
# This is the exception raised for those # This is the exception raised for those
except PDFPageCountError as e: except PDFPageCountError as e:
@ -191,141 +222,130 @@ def scan_file_for_barcodes(
logger.warning( logger.warning(
f"Exception during barcode scanning: {e}", f"Exception during barcode scanning: {e}",
) )
else:
logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}",
)
return DocumentBarcodeInfo(pdf_filepath, barcodes) def get_separation_pages(self) -> Dict[int, bool]:
"""
Search the parsed barcodes for separators and returns a dict of page
numbers, which separate the file into new files, together with the
information whether to keep the page.
"""
# filter all barcodes for the separator string
# get the page numbers of the separating barcodes
separator_pages = {bc.page: False for bc in self.barcodes if bc.is_separator}
if not settings.CONSUMER_ENABLE_ASN_BARCODE:
return separator_pages
# add the page numbers of the ASN barcodes
# (except for first page, that might lead to infinite loops).
return {
**separator_pages,
**{bc.page: True for bc in self.barcodes if bc.is_asn and bc.page != 0},
}
def get_separating_barcodes(barcodes: List[Barcode]) -> Dict[int, bool]: def separate_pages(self, pages_to_split_on: Dict[int, bool]) -> List[Path]:
""" """
Search the parsed barcodes for separators Separate the provided pdf file on the pages_to_split_on.
and returns a dict of page numbers, which The pages which are defined by the keys in page_numbers
separate the file into new files, together will be removed if the corresponding value is false.
with the information whether to keep the page. Returns a list of (temporary) filepaths to consume.
""" These will need to be deleted later.
# filter all barcodes for the separator string """
# get the page numbers of the separating barcodes
separator_pages = {bc.page: False for bc in barcodes if bc.is_separator}
if not settings.CONSUMER_ENABLE_ASN_BARCODE:
return separator_pages
# add the page numbers of the ASN barcodes document_paths = []
# (except for first page, that might lead to infinite loops). fname = self.file.with_suffix("").name
return { with Pdf.open(self.pdf_file) as input_pdf:
**separator_pages, # Start with an empty document
**{bc.page: True for bc in barcodes if bc.is_asn and bc.page != 0}, current_document: List[Page] = []
} # A list of documents, ie a list of lists of pages
documents: List[List[Page]] = [current_document]
for idx, page in enumerate(input_pdf.pages):
# Keep building the new PDF as long as it is not a
# separator index
if idx not in pages_to_split_on:
current_document.append(page)
continue
def get_asn_from_barcodes(barcodes: List[Barcode]) -> Optional[int]: # This is a split index
""" # Start a new destination page listing
Search the parsed barcodes for any ASNs. logger.debug(f"Starting new document at idx {idx}")
The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX current_document = []
is considered the ASN to be used. documents.append(current_document)
Returns the detected ASN (or None) keep_page = pages_to_split_on[idx]
""" if keep_page:
asn = None # Keep the page
# (new document is started by asn barcode)
current_document.append(page)
# get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX documents = [x for x in documents if len(x)]
asn_text = next(
(x.value for x in barcodes if x.is_asn),
None,
)
if asn_text: logger.debug(f"Split into {len(documents)} new documents")
logger.debug(f"Found ASN Barcode: {asn_text}")
# remove the prefix and remove whitespace
asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
# now, try parsing the ASN number # Write the new documents out
try: for doc_idx, document in enumerate(documents):
asn = int(asn_text) dst = Pdf.new()
except ValueError as e: dst.pages.extend(document)
logger.warning(f"Failed to parse ASN number because: {e}")
return asn output_filename = f"{fname}_document_{doc_idx}.pdf"
logger.debug(f"pdf no:{doc_idx} has {len(dst.pages)} pages")
savepath = Path(self.temp_dir.name) / output_filename
with open(savepath, "wb") as out:
dst.save(out)
document_paths.append(savepath)
def separate_pages(filepath: Path, pages_to_split_on: Dict[int, bool]) -> List[Path]: return document_paths
"""
Separate the provided pdf file on the pages_to_split_on.
The pages which are defined by the keys in page_numbers
will be removed if the corresponding value is false.
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
"""
document_paths = [] def separate(
self,
source: DocumentSource,
override_name: Optional[str] = None,
) -> bool:
"""
Separates the document, based on barcodes and configuration, creating new
documents as required in the appropriate location.
if not pages_to_split_on: Returns True if a split happened, False otherwise
logger.warning("No pages to split on!") """
return document_paths # Do nothing
if not self.supported_mime_type:
logger.warning(f"Unsupported file format for barcode reader: {self.mime}")
return False
os.makedirs(settings.SCRATCH_DIR, exist_ok=True) # Does nothing unless needed
tempdir = Path(tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)) self.convert_from_tiff_to_pdf()
fname = filepath.with_suffix("").name
pdf = Pdf.open(filepath)
# Start with an empty document # Actually read the codes, if any
current_document: List[Page] = [] self.detect()
# A list of documents, ie a list of lists of pages
documents: List[List[Page]] = [current_document]
for idx, page in enumerate(pdf.pages): separator_pages = self.get_separation_pages()
# Keep building the new PDF as long as it is not a
# separator index
if idx not in pages_to_split_on:
current_document.append(page)
continue
# This is a split index # Also do nothing
# Start a new destination page listing if not separator_pages:
logger.debug(f"Starting new document at idx {idx}") logger.warning("No pages to split on!")
current_document = [] return False
documents.append(current_document)
keep_page = pages_to_split_on[idx]
if keep_page:
# Keep the page
# (new document is started by asn barcode)
current_document.append(page)
documents = [x for x in documents if len(x)] # Create the split documents
doc_paths = self.separate_pages(separator_pages)
logger.debug(f"Split into {len(documents)} new documents") # Save the new documents to correct folder
if source != DocumentSource.ConsumeFolder:
# The given file is somewhere in SCRATCH_DIR,
# and new documents must be moved to the CONSUMPTION_DIR
# for the consumer to notice them
save_to_dir = settings.CONSUMPTION_DIR
else:
# The given file is somewhere in CONSUMPTION_DIR,
# and may be some levels down for recursive tagging
# so use the file's parent to preserve any metadata
save_to_dir = self.file.parent
# Write the new documents out for idx, document_path in enumerate(doc_paths):
for doc_idx, document in enumerate(documents): if override_name is not None:
dst = Pdf.new() newname = f"{str(idx)}_{override_name}"
dst.pages.extend(document) dest = save_to_dir / newname
else:
output_filename = f"{fname}_document_{doc_idx}.pdf" dest = save_to_dir
logger.info(f"Saving {document_path} to {dest}")
logger.debug(f"pdf no:{doc_idx} has {len(dst.pages)} pages") shutil.copy2(document_path, dest)
savepath = tempdir / output_filename return True
with open(savepath, "wb") as out:
dst.save(out)
document_paths.append(savepath)
return document_paths
def save_to_dir(
filepath: Path,
newname: str = None,
target_dir: Path = settings.CONSUMPTION_DIR,
):
"""
Copies filepath to target_dir.
Optionally rename the file.
"""
if filepath.is_file() and target_dir.is_dir():
dest = target_dir
if newname is not None:
dest = dest / newname
shutil.copy(filepath, dest)
logging.debug(f"saved {str(filepath)} to {str(dest)}")
else:
logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")

View File

@ -16,16 +16,15 @@ from filelock import FileLock
from redis.exceptions import ConnectionError from redis.exceptions import ConnectionError
from whoosh.writing import AsyncWriter from whoosh.writing import AsyncWriter
from documents import barcodes
from documents import index from documents import index
from documents import sanity_checker from documents import sanity_checker
from documents.barcodes import BarcodeReader
from documents.classifier import DocumentClassifier from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier from documents.classifier import load_classifier
from documents.consumer import Consumer from documents.consumer import Consumer
from documents.consumer import ConsumerError from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.file_handling import create_source_path_directory from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename from documents.file_handling import generate_unique_filename
from documents.models import Correspondent from documents.models import Correspondent
@ -96,95 +95,39 @@ def consume_file(
# read all barcodes in the current document # read all barcodes in the current document
if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE: if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
doc_barcode_info = barcodes.scan_file_for_barcodes( with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader:
input_doc.original_file, if settings.CONSUMER_ENABLE_BARCODES and reader.separate(
input_doc.mime_type, input_doc.source,
) overrides.filename,
):
# notify the sender, otherwise the progress bar
# in the UI stays stuck
payload = {
"filename": overrides.filename or input_doc.original_file.name,
"task_id": None,
"current_progress": 100,
"max_progress": 100,
"status": "SUCCESS",
"message": "finished",
}
try:
async_to_sync(get_channel_layer().group_send)(
"status_updates",
{"type": "status_update", "data": payload},
)
except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {str(e)}")
# consuming stops here, since the original document with
# the barcodes has been split and will be consumed separately
# split document by separator pages, if enabled input_doc.original_file.unlink()
if settings.CONSUMER_ENABLE_BARCODES: return "File successfully split"
separators = barcodes.get_separating_barcodes(doc_barcode_info.barcodes)
if len(separators) > 0: # try reading the ASN from barcode
logger.debug( if settings.CONSUMER_ENABLE_ASN_BARCODE:
f"Pages with separators found in: {input_doc.original_file}", overrides.asn = reader.asn
) if overrides.asn:
document_list = barcodes.separate_pages( logger.info(f"Found ASN in barcode: {overrides.asn}")
doc_barcode_info.pdf_path,
separators,
)
if document_list:
# If the file is an upload, it's in the scratch directory
# Move it to consume directory to be picked up
# Otherwise, use the current parent to keep possible tags
# from subdirectories
if input_doc.source != DocumentSource.ConsumeFolder:
save_to_dir = settings.CONSUMPTION_DIR
else:
# Note this uses the original file, because it's in the
# consume folder already and may include additional path
# components for tagging
# the .path is somewhere in scratch in this case
save_to_dir = input_doc.original_file.parent
for n, document in enumerate(document_list):
# save to consumption dir
# rename it to the original filename with number prefix
if overrides.filename is not None:
newname = f"{str(n)}_{overrides.filename}"
else:
newname = None
barcodes.save_to_dir(
document,
newname=newname,
target_dir=save_to_dir,
)
# Split file has been copied safely, remove it
document.unlink()
# And clean up the directory as well, now it's empty
shutil.rmtree(document_list[0].parent)
# This file has been split into multiple files without issue
# remove the original and working copy
input_doc.original_file.unlink()
# If the original file was a TIFF, remove the PDF generated from it
if input_doc.mime_type == "image/tiff":
logger.debug(
f"Deleting file {doc_barcode_info.pdf_path}",
)
doc_barcode_info.pdf_path.unlink()
# notify the sender, otherwise the progress bar
# in the UI stays stuck
payload = {
"filename": overrides.filename or input_doc.original_file.name,
"task_id": None,
"current_progress": 100,
"max_progress": 100,
"status": "SUCCESS",
"message": "finished",
}
try:
async_to_sync(get_channel_layer().group_send)(
"status_updates",
{"type": "status_update", "data": payload},
)
except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {str(e)}")
# consuming stops here, since the original document with
# the barcodes has been split and will be consumed separately
return "File successfully split"
# try reading the ASN from barcode
if settings.CONSUMER_ENABLE_ASN_BARCODE:
overrides.asn = barcodes.get_asn_from_barcodes(doc_barcode_info.barcodes)
if overrides.asn:
logger.info(f"Found ASN in barcode: {overrides.asn}")
# continue with consumption if no barcode was found # continue with consumption if no barcode was found
document = Consumer().try_consume_file( document = Consumer().try_consume_file(

Binary file not shown.

Before

Width:  |  Height:  |  Size: 836 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 39 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 9.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 891 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 337 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 7.4 KiB

File diff suppressed because it is too large Load Diff