paperless-ngx/src/documents/tasks.py

import logging
import os
import shutil
import tempfile
from typing import List  # for type hinting. Can be removed, if only Python >3.8 is used

import magic
import tqdm
from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer
from django.conf import settings
from django.db.models.signals import post_save
from documents import index
from documents import sanity_checker
from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException
from pdf2image import convert_from_path
from pikepdf import Pdf
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
from whoosh.writing import AsyncWriter


logger = logging.getLogger("paperless.tasks")


def index_optimize():
    ix = index.open_index()
    writer = AsyncWriter(ix)
    writer.commit(optimize=True)


def index_reindex(progress_bar_disable=False):
    documents = Document.objects.all()

    ix = index.open_index(recreate=True)

    with AsyncWriter(ix) as writer:
        for document in tqdm.tqdm(documents, disable=progress_bar_disable):
            index.update_document(writer, document)


def train_classifier():
    if (
        not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
        and not DocumentType.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
        and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
        and not StoragePath.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
    ):

        return

    classifier = load_classifier()

    if not classifier:
        classifier = DocumentClassifier()

    try:
        if classifier.train():
            logger.info(
                f"Saving updated classifier model to {settings.MODEL_FILE}...",
            )
            classifier.save()
        else:
            logger.debug("Training data unchanged.")

    except Exception as e:
        logger.warning("Classifier error: " + str(e))


def barcode_reader(image) -> List[str]:
    """
    Read any barcodes contained in image
    Returns a list containing all found barcodes
    """
    barcodes = []
    # Decode the barcode image
    detected_barcodes = pyzbar.decode(image)

    if detected_barcodes:
        # Traverse through all the detected barcodes in image
        for barcode in detected_barcodes:
            if barcode.data:
                decoded_barcode = barcode.data.decode("utf-8")
                barcodes.append(decoded_barcode)
                logger.debug(
                    f"Barcode of type {str(barcode.type)} found: {decoded_barcode}",
                )
    return barcodes


def get_file_type(path: str) -> str:
    """
    Determines the file type, based on MIME type.

    Returns the MIME type.
    """
    mime_type = magic.from_file(path, mime=True)
    logger.debug(f"Detected mime type: {mime_type}")
    return mime_type


def convert_from_tiff_to_pdf(filepath: str) -> str:
    """
    converts a given TIFF image file to pdf into a temporary directory.

    Returns the new pdf file.
    """
    file_name = os.path.splitext(os.path.basename(filepath))[0]
    mime_type = get_file_type(filepath)
    tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
    # use old file name with pdf extension
    if mime_type == "image/tiff":
        newpath = os.path.join(tempdir, file_name + ".pdf")
    else:
        logger.warning(
            f"Cannot convert mime type {str(mime_type)} from {str(filepath)} to pdf.",
        )
        return None
    with Image.open(filepath) as image:
        images = []
        for i, page in enumerate(ImageSequence.Iterator(image)):
            page = page.convert("RGB")
            images.append(page)
        try:
            if len(images) == 1:
                images[0].save(newpath)
            else:
                images[0].save(newpath, save_all=True, append_images=images[1:])
        except OSError as e:
            logger.warning(
                f"Could not save the file as pdf. Error: {str(e)}",
            )
            return None
    return newpath


def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
    """
    Scan the provided pdf file for page separating barcodes
    Returns a list of pagenumbers, which separate the file
    """
    separator_page_numbers = []
    separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
    # use a temporary directory in case the file os too big to handle in memory
    with tempfile.TemporaryDirectory() as path:
        pages_from_path = convert_from_path(filepath, output_folder=path)
        for current_page_number, page in enumerate(pages_from_path):
            current_barcodes = barcode_reader(page)
            if separator_barcode in current_barcodes:
                separator_page_numbers.append(current_page_number)
    return separator_page_numbers


def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
    """
    Separate the provided pdf file on the pages_to_split_on.
    The pages which are defined by page_numbers will be removed.
    Returns a list of (temporary) filepaths to consume.
    These will need to be deleted later.
    """
    os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
    tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
    fname = os.path.splitext(os.path.basename(filepath))[0]
    pdf = Pdf.open(filepath)
    document_paths = []
    logger.debug(f"Temp dir is {str(tempdir)}")
    if not pages_to_split_on:
        logger.warning("No pages to split on!")
    else:
        # go from the first page to the first separator page
        dst = Pdf.new()
        for n, page in enumerate(pdf.pages):
            if n < pages_to_split_on[0]:
                dst.pages.append(page)
        output_filename = f"{fname}_document_0.pdf"
        savepath = os.path.join(tempdir, output_filename)
        with open(savepath, "wb") as out:
            dst.save(out)
        document_paths = [savepath]

        # iterate through the rest of the document
        for count, page_number in enumerate(pages_to_split_on):
            logger.debug(f"Count: {str(count)} page_number: {str(page_number)}")
            dst = Pdf.new()
            try:
                next_page = pages_to_split_on[count + 1]
            except IndexError:
                next_page = len(pdf.pages)
            # skip the first page_number. This contains the barcode page
            for page in range(page_number + 1, next_page):
                logger.debug(
                    f"page_number: {str(page_number)} next_page: {str(next_page)}",
                )
                dst.pages.append(pdf.pages[page])
            output_filename = f"{fname}_document_{str(count + 1)}.pdf"
            logger.debug(f"pdf no:{str(count)} has {str(len(dst.pages))} pages")
            savepath = os.path.join(tempdir, output_filename)
            with open(savepath, "wb") as out:
                dst.save(out)
            document_paths.append(savepath)
    logger.debug(f"Temp files are {str(document_paths)}")
    return document_paths


def save_to_dir(
    filepath: str,
    newname: str = None,
    target_dir: str = settings.CONSUMPTION_DIR,
):
    """
    Copies filepath to target_dir.
    Optionally rename the file.
    """
    if os.path.isfile(filepath) and os.path.isdir(target_dir):
        dst = shutil.copy(filepath, target_dir)
        logging.debug(f"saved {str(filepath)} to {str(dst)}")
        if newname:
            dst_new = os.path.join(target_dir, newname)
            logger.debug(f"moving {str(dst)} to {str(dst_new)}")
            os.rename(dst, dst_new)
    else:
        logger.warning(f"{str(filepath)} or {str(target_dir)} don't exist.")


def consume_file(
    path,
    override_filename=None,
    override_title=None,
    override_correspondent_id=None,
    override_document_type_id=None,
    override_tag_ids=None,
    task_id=None,
    override_created=None,
):

    # check for separators in current document
    if settings.CONSUMER_ENABLE_BARCODES:
        separators = []
        document_list = []
        converted_tiff = None
        if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
            supported_mime = ["image/tiff", "application/pdf"]
        else:
            supported_mime = ["application/pdf"]
        mime_type = get_file_type(path)
        if mime_type not in supported_mime:
            # if not supported, skip this routine
            logger.warning(
                f"Unsupported file format for barcode reader: {str(mime_type)}",
            )
        else:
            if mime_type == "image/tiff":
                file_to_process = convert_from_tiff_to_pdf(path)
            else:
                file_to_process = path

            separators = scan_file_for_separating_barcodes(file_to_process)

            if separators:
                logger.debug(
                    f"Pages with separators found in: {str(path)}",
                )
                document_list = separate_pages(file_to_process, separators)

            if document_list:
                for n, document in enumerate(document_list):
                    # save to consumption dir
                    # rename it to the original filename  with number prefix
                    if override_filename:
                        newname = f"{str(n)}_" + override_filename
                    else:
                        newname = None
                    save_to_dir(document, newname=newname)
                # if we got here, the document was successfully split
                # and can safely be deleted
                if converted_tiff:
                    logger.debug(f"Deleting file {file_to_process}")
                    os.unlink(file_to_process)
                logger.debug(f"Deleting file {path}")
                os.unlink(path)
                # notify the sender, otherwise the progress bar
                # in the UI stays stuck
                payload = {
                    "filename": override_filename,
                    "task_id": task_id,
                    "current_progress": 100,
                    "max_progress": 100,
                    "status": "SUCCESS",
                    "message": "finished",
                }
                try:
                    async_to_sync(get_channel_layer().group_send)(
                        "status_updates",
                        {"type": "status_update", "data": payload},
                    )
                except OSError as e:
                    logger.warning(
                        "OSError. It could be, the broker cannot be reached.",
                    )
                    logger.warning(str(e))
                # consuming stops here, since the original document with
                # the barcodes has been split and will be consumed separately
                return "File successfully split"

    # continue with consumption if no barcode was found
    document = Consumer().try_consume_file(
        path,
        override_filename=override_filename,
        override_title=override_title,
        override_correspondent_id=override_correspondent_id,
        override_document_type_id=override_document_type_id,
        override_tag_ids=override_tag_ids,
        task_id=task_id,
        override_created=override_created,
    )

    if document:
        return f"Success. New document id {document.pk} created"
    else:
        raise ConsumerError(
            "Unknown error: Returned document was null, but "
            "no error message was given.",
        )


def sanity_check():
    messages = sanity_checker.check_sanity()

    messages.log_messages()

    if messages.has_error:
        raise SanityCheckFailedException("Sanity check failed with errors. See log.")
    elif messages.has_warning:
        return "Sanity check exited with warnings. See log."
    elif len(messages) > 0:
        return "Sanity check exited with infos. See log."
    else:
        return "No issues detected."


def bulk_update_documents(document_ids):
    documents = Document.objects.filter(id__in=document_ids)

    ix = index.open_index()

    for doc in documents:
        post_save.send(Document, instance=doc, created=False)

    with AsyncWriter(ix) as writer:
        for doc in documents:
            index.update_document(writer, doc)