paperless-ngx/src/paperless_remote/tests/test_parser.py

import uuid
from pathlib import Path
from unittest import mock

from django.test import TestCase
from django.test import override_settings

from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser


class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        # Asserts that all strings appear in content, in the given order.
        indices = []
        for s in strings:
            if s in content:
                indices.append(content.index(s))
            else:
                self.fail(f"'{s}' is not in '{content}'")
        self.assertListEqual(indices, sorted(indices))

    @mock.patch("paperless_remote.parsers.subprocess.run")
    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
    def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
        # Arrange mock Azure client
        mock_client = mock.Mock()
        mock_client_cls.return_value = mock_client

        # Simulate poller result and its `.details`
        mock_poller = mock.Mock()
        mock_poller.wait.return_value = None
        mock_poller.details = {"operation_id": "fake-op-id"}
        mock_client.begin_analyze_document.return_value = mock_poller

        # Return dummy PDF bytes
        mock_client.get_analyze_result_pdf.return_value = [
            b"%PDF-",
            b"1.7 ",
            b"FAKEPDF",
        ]

        # Simulate pdftotext by writing dummy text to sidecar file
        def fake_run(cmd, *args, **kwargs):
            with Path(cmd[-1]).open("w", encoding="utf-8") as f:
                f.write("This is a test document.")

        mock_subprocess.side_effect = fake_run

        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )