Files
paperless-ngx/src/documents/barcodes.py

544 lines
19 KiB
Python

from __future__ import annotations
import logging
import re
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING
from django.conf import settings
from pdf2image import convert_from_path
from pikepdf import Page
from pikepdf import PasswordError
from pikepdf import Pdf
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.models import Document
from documents.models import Tag
from documents.plugins.base import ConsumeTaskPlugin
from documents.plugins.base import StopConsumeTaskError
from documents.plugins.helpers import ProgressManager
from documents.plugins.helpers import ProgressStatusOptions
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from documents.utils import maybe_override_pixel_limit
from paperless.config import BarcodeConfig
if TYPE_CHECKING:
from collections.abc import Callable
from PIL import Image
logger = logging.getLogger("paperless.barcodes")
@dataclass(frozen=True)
class Barcode:
"""
Holds the information about a single barcode and its location in a document
"""
page: int
value: str
settings: BarcodeConfig
@property
def is_separator(self) -> bool:
"""
Returns True if the barcode value equals the configured separation value,
False otherwise
"""
return self.value == self.settings.barcode_string
@property
def is_asn(self) -> bool:
"""
Returns True if the barcode value matches the configured ASN prefix,
False otherwise
"""
return self.value.startswith(self.settings.barcode_asn_prefix)
@property
def is_tag(self) -> bool:
"""
Returns True if the barcode value matches any configured tag mapping pattern,
False otherwise.
Note: This does NOT exclude ASN or separator barcodes - they can also be used
as tags if they match a tag mapping pattern (e.g., {"ASN12.*": "JOHN"}).
"""
for regex in self.settings.barcode_tag_mapping:
if re.match(regex, self.value, flags=re.IGNORECASE):
return True
return False
class BarcodePlugin(ConsumeTaskPlugin):
NAME: str = "BarcodePlugin"
@property
def able_to_run(self) -> bool:
"""
Able to run if:
- ASN from barcode detection is enabled or
- Barcode support is enabled and the mime type is supported
"""
if self.settings.barcode_enable_tiff_support:
supported_mimes: set[str] = {"application/pdf", "image/tiff"}
else:
supported_mimes = {"application/pdf"}
return (
self.settings.barcode_enable_asn
or self.settings.barcodes_enabled
or self.settings.barcode_enable_tag
) and self.input_doc.mime_type in supported_mimes
def get_settings(self) -> BarcodeConfig:
"""
Returns the settings for this plugin (Django settings or app config)
"""
return BarcodeConfig()
def __init__(
self,
input_doc: ConsumableDocument,
metadata: DocumentMetadataOverrides,
status_mgr: ProgressManager,
base_tmp_dir: Path,
task_id: str,
) -> None:
super().__init__(
input_doc,
metadata,
status_mgr,
base_tmp_dir,
task_id,
)
# need these for able_to_run
self.settings = self.get_settings()
def setup(self) -> None:
self.temp_dir = tempfile.TemporaryDirectory(
dir=self.base_tmp_dir,
prefix="barcode",
)
self.pdf_file: Path = self.input_doc.original_file
self._tiff_conversion_done = False
self.barcodes: list[Barcode] = []
def _apply_detected_asn(self, detected_asn: int) -> None:
"""
Apply a detected ASN to metadata if allowed.
"""
if (
self.metadata.skip_asn_if_exists
and Document.global_objects.filter(
archive_serial_number=detected_asn,
).exists()
):
logger.info(
f"Found ASN in barcode {detected_asn} but skipping because it already exists.",
)
return
logger.info(f"Found ASN in barcode: {detected_asn}")
self.metadata.asn = detected_asn
def run(self) -> None:
# Some operations may use PIL, override pixel setting if needed
maybe_override_pixel_limit()
# Maybe do the conversion of TIFF to PDF
self.convert_from_tiff_to_pdf()
# Locate any barcodes in the files
self.detect()
# try reading tags from barcodes
# If tag splitting is enabled, skip this on the original document - let each split document extract its own tags
# However, if we're processing a split document (original_path is set), extract tags
if (
self.settings.barcode_enable_tag
and (
not self.settings.barcode_tag_split
or self.input_doc.original_path is not None
)
and (tags := self.tags) is not None
and len(tags) > 0
):
if self.metadata.tag_ids:
self.metadata.tag_ids += tags
else:
self.metadata.tag_ids = tags
logger.info(f"Found tags in barcode: {tags}")
# Lastly attempt to split documents
if self.settings.barcodes_enabled and (
separator_pages := self.get_separation_pages()
):
# We have pages to split against
# Note this does NOT use the base_temp_dir, as that will be removed
tmp_dir = Path(
tempfile.mkdtemp(
dir=settings.SCRATCH_DIR,
prefix="paperless-barcode-split-",
),
).resolve()
from documents import tasks
# Create the split document tasks
for new_document in self.separate_pages(separator_pages):
copy_file_with_basic_stats(new_document, tmp_dir / new_document.name)
task = tasks.consume_file.delay(
ConsumableDocument(
# Same source, for templates
source=self.input_doc.source,
mailrule_id=self.input_doc.mailrule_id,
# Can't use same folder or the consume might grab it again
original_file=(tmp_dir / new_document.name).resolve(),
# Adding optional original_path for later uses in
# workflow matching
original_path=self.input_doc.original_file,
),
# All the same metadata
self.metadata,
)
logger.info(f"Created new task {task.id} for {new_document.name}")
# This file is now two or more files
self.input_doc.original_file.unlink()
msg = "Barcode splitting complete!"
# Update the progress to complete
self.status_mgr.send_progress(ProgressStatusOptions.SUCCESS, msg, 100, 100)
# Request the consume task stops
raise StopConsumeTaskError(msg)
# Update/overwrite an ASN if possible
# After splitting, as otherwise each split document gets the same ASN
if self.settings.barcode_enable_asn and (located_asn := self.asn) is not None:
self._apply_detected_asn(located_asn)
def cleanup(self) -> None:
self.temp_dir.cleanup()
def convert_from_tiff_to_pdf(self) -> None:
"""
May convert a TIFF image into a PDF, if the input is a TIFF and
the TIFF has not been made into a PDF
"""
# Nothing to do, pdf_file is already assigned correctly
if self.input_doc.mime_type != "image/tiff" or self._tiff_conversion_done:
return
self.pdf_file = convert_from_tiff_to_pdf(
self.input_doc.original_file,
Path(self.temp_dir.name),
)
self._tiff_conversion_done = True
@staticmethod
def read_barcodes_zxing(image: Image.Image) -> list[str]:
barcodes = []
import zxingcpp
detected_barcodes = zxingcpp.read_barcodes(image)
for barcode in detected_barcodes:
if barcode.text:
barcodes.append(barcode.text)
logger.debug(
f"Barcode of type {barcode.format} found: {barcode.text}",
)
return barcodes
@staticmethod
def read_barcodes_pyzbar(image: Image.Image) -> list[str]:
barcodes = []
from pyzbar import pyzbar
# Decode the barcode image
detected_barcodes = pyzbar.decode(image)
# Traverse through all the detected barcodes in image
for barcode in detected_barcodes:
if barcode.data:
decoded_barcode = barcode.data.decode("utf-8")
barcodes.append(decoded_barcode)
logger.debug(
f"Barcode of type {barcode.type} found: {decoded_barcode}",
)
return barcodes
def detect(self) -> None:
"""
Scan all pages of the PDF as images, updating barcodes and the pages
found on as we go
"""
# Bail if barcodes already exist
if self.barcodes:
return
# No op if not a TIFF
self.convert_from_tiff_to_pdf()
# Choose the library for reading
if settings.CONSUMER_BARCODE_SCANNER == "PYZBAR":
reader: Callable[[Image.Image], list[str]] = self.read_barcodes_pyzbar
logger.debug("Scanning for barcodes using PYZBAR")
else:
reader = self.read_barcodes_zxing
logger.debug("Scanning for barcodes using ZXING")
try:
# Read number of pages from pdf
with Pdf.open(self.pdf_file) as pdf:
num_of_pages = len(pdf.pages)
logger.debug(f"PDF has {num_of_pages} pages")
# Get limit from configuration
barcode_max_pages: int = (
num_of_pages
if self.settings.barcode_max_pages == 0
else self.settings.barcode_max_pages
)
if barcode_max_pages < num_of_pages: # pragma: no cover
logger.debug(
f"Barcodes detection will be limited to the first {barcode_max_pages} pages",
)
# Loop al page
for current_page_number in range(min(num_of_pages, barcode_max_pages)):
logger.debug(f"Processing page {current_page_number}")
# Convert page to image
page = convert_from_path(
self.pdf_file,
dpi=self.settings.barcode_dpi,
output_folder=self.temp_dir.name,
first_page=current_page_number + 1,
last_page=current_page_number + 1,
)[0]
# Remember filename, since it is lost by upscaling
page_filepath = Path(page.filename)
logger.debug(f"Image is at {page_filepath}")
# Upscale image if configured
factor = self.settings.barcode_upscale
if factor > 1.0:
logger.debug(
f"Upscaling image by {factor} for better barcode detection",
)
x, y = page.size
page = page.resize(
(round(x * factor), (round(y * factor))),
)
# Detect barcodes
for barcode_value in reader(page):
self.barcodes.append(
Barcode(current_page_number, barcode_value, self.settings),
)
# Delete temporary image file
page_filepath.unlink()
# Password protected files can't be checked
# This is the exception raised for those
except PasswordError as e:
logger.warning(
f"File is likely password protected, not checking for barcodes: {e}",
)
# This file is really borked, allow the consumption to continue
# but it may fail further on
except Exception as e: # pragma: no cover
logger.warning(
f"Exception during barcode scanning: {e}",
)
@property
def asn(self) -> int | None:
"""
Search the parsed barcodes for any ASNs.
The first barcode that starts with barcode_asn_prefix
is considered the ASN to be used.
Returns the detected ASN (or None)
"""
asn = None
# Ensure the barcodes have been read
self.detect()
# get the first barcode that starts with barcode_asn_prefix
asn_text: str | None = next(
(x.value for x in self.barcodes if x.is_asn),
None,
)
if asn_text:
logger.debug(f"Found ASN Barcode: {asn_text}")
# remove the prefix and remove whitespace
asn_text = asn_text[len(self.settings.barcode_asn_prefix) :].strip()
# remove non-numeric parts of the remaining string
asn_text = re.sub(r"\D", "", asn_text)
# now, try parsing the ASN number
try:
asn = int(asn_text)
except ValueError as e:
logger.warning(f"Failed to parse ASN number because: {e}")
return asn
@property
def tags(self) -> list[int]:
"""
Search the parsed barcodes for any tags.
Returns the detected tag ids (or empty list)
"""
tags: list[int] = []
# Ensure the barcodes have been read
self.detect()
for x in self.barcodes:
tag_texts: str = x.value
for raw in tag_texts.split(","):
try:
tag_str: str | None = None
for regex in self.settings.barcode_tag_mapping:
if re.match(regex, raw, flags=re.IGNORECASE):
sub = self.settings.barcode_tag_mapping[regex]
tag_str = (
re.sub(regex, sub, raw, flags=re.IGNORECASE)
if sub
else raw
)
break
if tag_str:
tag, _ = Tag.objects.get_or_create(
name__iexact=tag_str,
defaults={"name": tag_str},
)
logger.debug(
f"Found Tag Barcode '{raw}', substituted "
f"to '{tag}' and mapped to "
f"tag #{tag.pk}.",
)
tags.append(tag.pk)
except Exception as e:
logger.error(
f"Failed to find or create TAG '{raw}' because: {e}",
)
return tags
def get_separation_pages(self) -> dict[int, bool]:
"""
Search the parsed barcodes for separators and returns a dict of page
numbers, which separate the file into new files, together with the
information whether to keep the page.
"""
# filter all barcodes for the separator string
# get the page numbers of the separating barcodes
retain = self.settings.barcode_retain_split_pages
separator_pages = {
bc.page: retain
for bc in self.barcodes
if bc.is_separator and (not retain or (retain and bc.page > 0))
} # as below, dont include the first page if retain is enabled
# add the page numbers of the ASN barcodes
# (except for first page, that might lead to infinite loops).
if self.settings.barcode_enable_asn:
separator_pages = {
**separator_pages,
**{bc.page: True for bc in self.barcodes if bc.is_asn and bc.page != 0},
}
# add the page numbers of the TAG barcodes if splitting is enabled
# (except for first page, that might lead to infinite loops).
if self.settings.barcode_tag_split and self.settings.barcode_enable_tag:
separator_pages = {
**separator_pages,
**{bc.page: True for bc in self.barcodes if bc.is_tag and bc.page != 0},
}
return separator_pages
def separate_pages(self, pages_to_split_on: dict[int, bool]) -> list[Path]:
"""
Separate the provided pdf file on the pages_to_split_on.
The pages which are defined by the keys in page_numbers
will be removed if the corresponding value is false.
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
"""
document_paths = []
fname: str = self.input_doc.original_file.stem
with Pdf.open(self.pdf_file) as input_pdf:
# Start with an empty document
current_document: list[Page] = []
# A list of documents, ie a list of lists of pages
documents: list[list[Page]] = [current_document]
for idx, page in enumerate(input_pdf.pages):
# Keep building the new PDF as long as it is not a
# separator index
if idx not in pages_to_split_on:
current_document.append(page)
continue
# This is a split index
# Start a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
current_document = []
documents.append(current_document)
keep_page: bool = pages_to_split_on[idx]
if keep_page:
# Keep the page
# (new document is started by asn barcode)
current_document.append(page)
documents = [x for x in documents if len(x)]
logger.debug(f"Split into {len(documents)} new documents")
# Write the new documents out
for doc_idx, document in enumerate(documents):
dst = Pdf.new()
dst.pages.extend(document)
output_filename = f"{fname}_document_{doc_idx}.pdf"
logger.debug(f"pdf no:{doc_idx} has {len(dst.pages)} pages")
savepath = Path(self.temp_dir.name) / output_filename
with savepath.open("wb") as out:
dst.save(out)
copy_basic_file_stats(self.input_doc.original_file, savepath)
document_paths.append(savepath)
return document_paths