paperless-ngx/src/paperless_remote/parsers.py

from pathlib import Path

from django.conf import settings

from paperless_tesseract.parsers import RasterisedDocumentParser


class RemoteEngineConfig:
    def __init__(
        self,
        engine: str,
        api_key: str | None = None,
        endpoint: str | None = None,
    ):
        self.engine = engine
        self.api_key = api_key
        self.endpoint = endpoint

    def engine_is_valid(self):
        valid = self.engine in ["azureai"] and self.api_key is not None
        if self.engine == "azureai":
            valid = valid and self.endpoint is not None
        return valid


class RemoteDocumentParser(RasterisedDocumentParser):
    """
    This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
    as this is the only service that provides a remote OCR API with text-embedded PDF output.
    """

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Returns the configuration for the remote OCR engine, loaded from Django settings.
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )

    def supported_mime_types(self):
        if self.settings.engine_is_valid():
            return {
                "application/pdf": ".pdf",
                "image/png": ".png",
                "image/jpeg": ".jpg",
                "image/tiff": ".tiff",
                "image/bmp": ".bmp",
                "image/gif": ".gif",
                "image/webp": ".webp",
            }
        else:
            return {}

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> str | None:
        """
        Uses Azure AI Vision to parse the document and return the text content.
        It requests a searchable PDF output with embedded text.
        The PDF is saved to the archive_path attribute.
        Returns the text content extracted from the document.
        If the parsing fails, it returns None.
        """
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
        from azure.ai.documentintelligence.models import AnalyzeOutputOption
        from azure.ai.documentintelligence.models import DocumentContentFormat
        from azure.core.credentials import AzureKeyCredential

        client = DocumentIntelligenceClient(
            endpoint=self.settings.endpoint,
            credential=AzureKeyCredential(self.settings.api_key),
        )

        with file.open("rb") as f:
            analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
            poller = client.begin_analyze_document(
                model_id="prebuilt-read",
                body=analyze_request,
                output_content_format=DocumentContentFormat.TEXT,
                output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
                content_type="application/json",
            )

        poller.wait()
        result_id = poller.details["operation_id"]
        result = poller.result()

        # Download the PDF with embedded text
        self.archive_path = Path(self.tempdir) / "archive.pdf"
        with self.archive_path.open("wb") as f:
            for chunk in client.get_analyze_result_pdf(
                model_id="prebuilt-read",
                result_id=result_id,
            ):
                f.write(chunk)

        return result.content

    def parse(self, document_path: Path, mime_type, file_name=None):
        if not self.settings.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
            return
        elif self.settings.engine == "azureai":
            self.text = self.azure_ai_vision_parse(document_path)