# paperless-ngx/src/paperless_tika/parsers.py

from pathlib import Path

import httpx
from django.conf import settings
from django.utils import timezone
from gotenberg_client import GotenbergClient
from gotenberg_client.options import PdfAFormat
from tika_client import TikaClient

from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.config import OutputTypeConfig
from paperless.models import OutputTypeChoices


class TikaDocumentParser(DocumentParser):
    """
    This parser sends documents to a local tika server

    Text, creation date and metadata are requested from the Tika server at
    ``settings.TIKA_ENDPOINT``. The PDF rendition used as the archive file
    and as the thumbnail source is requested from the Gotenberg server at
    ``settings.TIKA_GOTENBERG_ENDPOINT``.
    """

    logging_name = "paperless.parsing.tika"

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """
        Create a thumbnail from the PDF rendition of the document.

        The document is converted to PDF only when ``self.archive_path`` is
        not set yet (``parse`` sets it), so an existing conversion is reused.
        """
        if not self.archive_path:
            self.archive_path = self.convert_to_pdf(document_path, file_name)

        return make_thumbnail_from_pdf(
            self.archive_path,
            self.tempdir,
            self.logging_group,
        )

    def extract_metadata(self, document_path, mime_type):
        """
        Fetch the metadata Tika reports for the document.

        Returns a list with one dict per metadata entry, each holding the
        keys "namespace" and "prefix" (always empty strings), "key" and
        "value".

        This is best effort: any failure is logged as a warning and an empty
        list is returned instead of raising.
        """
        try:
            with TikaClient(
                tika_url=settings.TIKA_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client:
                parsed = client.metadata.from_file(document_path, mime_type)
                return [
                    {
                        "namespace": "",
                        "prefix": "",
                        "key": key,
                        "value": value,
                    }
                    for key, value in parsed.data.items()
                ]
        except Exception as e:
            self.log.warning(
                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return []

    def parse(self, document_path: Path, mime_type: str, file_name=None) -> None:
        """
        Extract text and creation date via Tika, then build the PDF archive.

        On success ``self.text`` holds the stripped text (or None when Tika
        returned no content), ``self.date`` holds the timezone-aware creation
        date (or None), and ``self.archive_path`` points at the converted PDF.

        Raises:
            ParseError: if the Tika request fails or the PDF conversion fails.
        """
        self.log.info(f"Sending {document_path} to Tika server")

        try:
            with TikaClient(
                tika_url=settings.TIKA_ENDPOINT,
                timeout=settings.CELERY_TASK_TIME_LIMIT,
            ) as client:
                try:
                    parsed = client.tika.as_text.from_file(document_path, mime_type)
                except httpx.HTTPStatusError as err:
                    # Workaround https://issues.apache.org/jira/browse/TIKA-4110
                    # Tika fails with some files as multi-part form data,
                    # so retry once by sending the raw bytes instead.
                    if err.response.status_code == httpx.codes.INTERNAL_SERVER_ERROR:
                        parsed = client.tika.as_text.from_buffer(
                            document_path.read_bytes(),
                            mime_type,
                        )
                    else:  # pragma: no cover
                        raise
        except Exception as err:
            raise ParseError(
                f"Could not parse {document_path} with tika server at "
                f"{settings.TIKA_ENDPOINT}: {err}",
            ) from err

        self.text = parsed.content
        if self.text is not None:
            self.text = self.text.strip()

        # Naive datetimes are made timezone-aware before being stored
        self.date = parsed.created
        if self.date is not None and timezone.is_naive(self.date):
            self.date = timezone.make_aware(self.date)

        self.archive_path = self.convert_to_pdf(document_path, file_name)

    def convert_to_pdf(self, document_path: Path, file_name) -> Path:
        """
        Convert the document to PDF through Gotenberg's LibreOffice route.

        The result is written to ``convert.pdf`` inside ``self.tempdir`` and
        that path is returned. ``file_name`` is accepted but not used.

        Raises:
            ParseError: if any step of the Gotenberg interaction or writing
                the resulting file fails.
        """
        pdf_path = Path(self.tempdir) / "convert.pdf"

        self.log.info(f"Converting {document_path} to PDF as {pdf_path}")

        # The whole interaction is guarded, mirroring parse(), so client
        # setup and route configuration failures also become ParseError
        try:
            with (
                GotenbergClient(
                    host=settings.TIKA_GOTENBERG_ENDPOINT,
                    timeout=settings.CELERY_TASK_TIME_LIMIT,
                ) as client,
                client.libre_office.to_pdf() as route,
            ):
                # Set the output format of the resulting PDF
                # NOTE(review): this reads settings.OCR_OUTPUT_TYPE directly
                # rather than the OutputTypeConfig from get_settings();
                # confirm that is intended.
                if settings.OCR_OUTPUT_TYPE in {
                    OutputTypeChoices.PDF_A,
                    OutputTypeChoices.PDF_A2,
                }:
                    route.pdf_format(PdfAFormat.A2b)
                elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
                    self.log.warning(
                        "Gotenberg does not support PDF/A-1a, choosing PDF/A-2b instead",
                    )
                    route.pdf_format(PdfAFormat.A2b)
                elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
                    route.pdf_format(PdfAFormat.A3b)
                # Any other output type: no PDF/A format is requested

                route.convert(document_path)

                response = route.run()

                pdf_path.write_bytes(response.content)

                return pdf_path
        except Exception as err:
            raise ParseError(
                f"Error while converting document to PDF: {err}",
            ) from err

    def get_settings(self) -> OutputTypeConfig:
        """
        This parser only uses the PDF output type configuration currently
        """
        return OutputTypeConfig()