mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-05-23 12:58:18 -05:00
68 lines
2.3 KiB
Python
68 lines
2.3 KiB
Python
import uuid
|
|
from pathlib import Path
|
|
from unittest import mock
|
|
|
|
from django.test import TestCase
|
|
from django.test import override_settings
|
|
|
|
from documents.tests.utils import DirectoriesMixin
|
|
from documents.tests.utils import FileSystemAssertsMixin
|
|
from paperless_remote.parsers import RemoteDocumentParser
|
|
|
|
|
|
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Tests for RemoteDocumentParser with the Azure client and subprocess mocked."""

    # Sample documents live in a "samples" directory next to this test module.
    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        """
        Assert that all strings appear in content, in the given order.

        Each string is searched for starting at the position where the
        previous one matched, so a string that also occurs earlier in the
        content does not cause a false failure.

        Args:
            content: The text to search.
            strings: The substrings expected to occur, in order.

        Raises:
            AssertionError: If a string is missing from content, or is only
                found before the match of the preceding string.
        """
        search_from = 0
        for s in strings:
            if s not in content:
                self.fail(f"'{s}' is not in '{content}'")
            position = content.find(s, search_from)
            if position == -1:
                self.fail(
                    f"'{s}' does not appear in the expected order in '{content}'",
                )
            # Continue from the match start (not its end) so that overlapping
            # or identical-position matches remain acceptable.
            search_from = position

    @mock.patch("paperless_remote.parsers.subprocess.run")
    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
    def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
        """
        Parse a sample PDF with the "azureai" engine configured.

        The Azure client class and subprocess.run are both replaced by mocks;
        the fake subprocess writes a known sentence to the last path in its
        command, and the test checks that sentence ends up in parser.text.
        """
        # Arrange mock Azure client
        mock_client = mock.Mock()
        mock_client_cls.return_value = mock_client

        # Simulate poller result and its `.details`
        mock_poller = mock.Mock()
        mock_poller.wait.return_value = None
        mock_poller.details = {"operation_id": "fake-op-id"}
        mock_client.begin_analyze_document.return_value = mock_poller

        # Return dummy PDF bytes, delivered as a sequence of chunks
        mock_client.get_analyze_result_pdf.return_value = [
            b"%PDF-",
            b"1.7 ",
            b"FAKEPDF",
        ]

        # Simulate pdftotext by writing dummy text to sidecar file
        # (taken to be the last element of the command line).
        def fake_run(cmd, *args, **kwargs):
            Path(cmd[-1]).write_text("This is a test document.", encoding="utf-8")

        mock_subprocess.side_effect = fake_run

        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )
|