Refactor file consumption task to allow beginnings of a plugin system (#5367)

This commit is contained in:
Trenton H
2024-01-13 08:11:14 -08:00
committed by GitHub
parent 4dbf8d7969
commit 2da5e46386
11 changed files with 767 additions and 531 deletions

View File

@@ -3,7 +3,6 @@ import re
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Final
from typing import Optional
from django.conf import settings
@@ -15,8 +14,9 @@ from PIL import Image
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.data_models import DocumentSource
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import StopConsumeTaskError
from documents.plugins.helpers import ProgressStatusOptions
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
@@ -26,7 +26,7 @@ logger = logging.getLogger("paperless.barcodes")
@dataclass(frozen=True)
class Barcode:
"""
Holds the information about a single barcode and its location
Holds the information about a single barcode and its location in a document
"""
page: int
@@ -49,77 +49,111 @@ class Barcode:
return self.value.startswith(settings.CONSUMER_ASN_BARCODE_PREFIX)
class BarcodeReader:
def __init__(self, filepath: Path, mime_type: str) -> None:
self.file: Final[Path] = filepath
self.mime: Final[str] = mime_type
self.pdf_file: Path = self.file
self.barcodes: list[Barcode] = []
self._tiff_conversion_done = False
self.temp_dir: Optional[tempfile.TemporaryDirectory] = None
class BarcodePlugin(ConsumeTaskPlugin):
NAME: str = "BarcodePlugin"
@property
def able_to_run(self) -> bool:
"""
Able to run if:
- ASN from barcode detection is enabled or
- Barcode support is enabled and the mime type is supported
"""
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"}
supported_mimes = {"application/pdf", "image/tiff"}
else:
self.SUPPORTED_FILE_MIMES = {"application/pdf"}
supported_mimes = {"application/pdf"}
def __enter__(self):
if self.supported_mime_type:
self.temp_dir = tempfile.TemporaryDirectory(prefix="paperless-barcodes")
return self
return (
settings.CONSUMER_ENABLE_ASN_BARCODE or settings.CONSUMER_ENABLE_BARCODES
) and self.input_doc.mime_type in supported_mimes
def __exit__(self, exc_type, exc_val, exc_tb):
if self.temp_dir is not None:
self.temp_dir.cleanup()
self.temp_dir = None
def setup(self):
self.temp_dir = tempfile.TemporaryDirectory(
dir=self.base_tmp_dir,
prefix="barcode",
)
self.pdf_file = self.input_doc.original_file
self._tiff_conversion_done = False
self.barcodes: list[Barcode] = []
@property
def supported_mime_type(self) -> bool:
"""
Return True if the given mime type is supported for barcodes, false otherwise
"""
return self.mime in self.SUPPORTED_FILE_MIMES
def run(self) -> Optional[str]:
# Maybe do the conversion of TIFF to PDF
self.convert_from_tiff_to_pdf()
@property
def asn(self) -> Optional[int]:
"""
Search the parsed barcodes for any ASNs.
The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
is considered the ASN to be used.
Returns the detected ASN (or None)
"""
asn = None
if not self.supported_mime_type:
return None
# Ensure the barcodes have been read
# Locate any barcodes in the files
self.detect()
# get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
asn_text = next(
(x.value for x in self.barcodes if x.is_asn),
None,
# Update/overwrite an ASN if possible
located_asn = self.asn
if located_asn is not None:
logger.info(f"Found ASN in barcode: {located_asn}")
self.metadata.asn = located_asn
separator_pages = self.get_separation_pages()
if not separator_pages:
return "No pages to split on!"
# We have pages to split against
# Note this does NOT use the base_temp_dir, as that will be removed
tmp_dir = Path(
tempfile.mkdtemp(
dir=settings.SCRATCH_DIR,
prefix="paperless-barcode-split-",
),
).resolve()
from documents import tasks
# Create the split document tasks
for new_document in self.separate_pages(separator_pages):
copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
task = tasks.consume_file.delay(
ConsumableDocument(
# Same source, for templates
source=self.input_doc.source,
mailrule_id=self.input_doc.mailrule_id,
# Can't use same folder or the consume might grab it again
original_file=(tmp_dir / new_document.name).resolve(),
),
# All the same metadata
self.metadata,
)
logger.info(f"Created new task {task.id} for {new_document.name}")
# This file is now two or more files
self.input_doc.original_file.unlink()
msg = "Barcode splitting complete!"
# Update the progress to complete
self.status_mgr.send_progress(ProgressStatusOptions.SUCCESS, msg, 100, 100)
# Request the consume task stops
raise StopConsumeTaskError(msg)
def cleanup(self) -> None:
self.temp_dir.cleanup()
def convert_from_tiff_to_pdf(self):
"""
May convert a TIFF image into a PDF, if the input is a TIFF and
the TIFF has not been made into a PDF
"""
# Nothing to do, pdf_file is already assigned correctly
if self.input_doc.mime_type != "image/tiff" or self._tiff_conversion_done:
return
self.pdf_file = convert_from_tiff_to_pdf(
self.input_doc.original_file,
Path(self.temp_dir.name),
)
if asn_text:
logger.debug(f"Found ASN Barcode: {asn_text}")
# remove the prefix and remove whitespace
asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
# remove non-numeric parts of the remaining string
asn_text = re.sub(r"\D", "", asn_text)
# now, try parsing the ASN number
try:
asn = int(asn_text)
except ValueError as e:
logger.warning(f"Failed to parse ASN number because: {e}")
return asn
self._tiff_conversion_done = True
@staticmethod
def read_barcodes_zxing(image: Image) -> list[str]:
def read_barcodes_zxing(image: Image.Image) -> list[str]:
barcodes = []
import zxingcpp
@@ -135,7 +169,7 @@ class BarcodeReader:
return barcodes
@staticmethod
def read_barcodes_pyzbar(image: Image) -> list[str]:
def read_barcodes_pyzbar(image: Image.Image) -> list[str]:
barcodes = []
from pyzbar import pyzbar
@@ -154,18 +188,6 @@ class BarcodeReader:
return barcodes
def convert_from_tiff_to_pdf(self):
"""
May convert a TIFF image into a PDF, if the input is a TIFF and
the TIFF has not been made into a PDF
"""
# Nothing to do, pdf_file is already assigned correctly
if self.mime != "image/tiff" or self._tiff_conversion_done:
return
self._tiff_conversion_done = True
self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name))
def detect(self) -> None:
"""
Scan all pages of the PDF as images, updating barcodes and the pages
@@ -218,10 +240,45 @@ class BarcodeReader:
# This file is really borked, allow the consumption to continue
# but it may fail further on
except Exception as e: # pragma: no cover
logger.exception(
logger.warning(
f"Exception during barcode scanning: {e}",
)
@property
def asn(self) -> Optional[int]:
"""
Search the parsed barcodes for any ASNs.
The first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
is considered the ASN to be used.
Returns the detected ASN (or None)
"""
asn = None
# Ensure the barcodes have been read
self.detect()
# get the first barcode that starts with CONSUMER_ASN_BARCODE_PREFIX
asn_text = next(
(x.value for x in self.barcodes if x.is_asn),
None,
)
if asn_text:
logger.debug(f"Found ASN Barcode: {asn_text}")
# remove the prefix and remove whitespace
asn_text = asn_text[len(settings.CONSUMER_ASN_BARCODE_PREFIX) :].strip()
# remove non-numeric parts of the remaining string
asn_text = re.sub(r"\D", "", asn_text)
# now, try parsing the ASN number
try:
asn = int(asn_text)
except ValueError as e:
logger.warning(f"Failed to parse ASN number because: {e}")
return asn
def get_separation_pages(self) -> dict[int, bool]:
"""
Search the parsed barcodes for separators and returns a dict of page
@@ -251,7 +308,7 @@ class BarcodeReader:
"""
document_paths = []
fname = self.file.stem
fname = self.input_doc.original_file.stem
with Pdf.open(self.pdf_file) as input_pdf:
# Start with an empty document
current_document: list[Page] = []
@@ -292,58 +349,8 @@ class BarcodeReader:
with open(savepath, "wb") as out:
dst.save(out)
copy_basic_file_stats(self.file, savepath)
copy_basic_file_stats(self.input_doc.original_file, savepath)
document_paths.append(savepath)
return document_paths
def separate(
self,
source: DocumentSource,
overrides: DocumentMetadataOverrides,
) -> bool:
"""
Separates the document, based on barcodes and configuration, creating new
documents as required in the appropriate location.
Returns True if a split happened, False otherwise
"""
# Do nothing
if not self.supported_mime_type:
logger.warning(f"Unsupported file format for barcode reader: {self.mime}")
return False
# Does nothing unless needed
self.convert_from_tiff_to_pdf()
# Actually read the codes, if any
self.detect()
separator_pages = self.get_separation_pages()
# Also do nothing
if not separator_pages:
logger.warning("No pages to split on!")
return False
tmp_dir = Path(tempfile.mkdtemp(prefix="paperless-barcode-split-")).resolve()
from documents import tasks
# Create the split document tasks
for new_document in self.separate_pages(separator_pages):
copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
tasks.consume_file.delay(
ConsumableDocument(
# Same source, for templates
source=source,
# Can't use same folder or the consume might grab it again
original_file=(tmp_dir / new_document.name).resolve(),
),
# All the same metadata
overrides,
)
logger.info("Barcode splitting complete!")
return True