diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py index 4b636646f..493b7d7bb 100644 --- a/src/paperless_remote/parsers.py +++ b/src/paperless_remote/parsers.py @@ -77,31 +77,36 @@ class RemoteDocumentParser(RasterisedDocumentParser): credential=AzureKeyCredential(self.settings.api_key), ) - with file.open("rb") as f: - analyze_request = AnalyzeDocumentRequest(bytes_source=f.read()) - poller = client.begin_analyze_document( - model_id="prebuilt-read", - body=analyze_request, - output_content_format=DocumentContentFormat.TEXT, - output=[AnalyzeOutputOption.PDF], # request searchable PDF output - content_type="application/json", - ) + try: + with file.open("rb") as f: + analyze_request = AnalyzeDocumentRequest(bytes_source=f.read()) + poller = client.begin_analyze_document( + model_id="prebuilt-read", + body=analyze_request, + output_content_format=DocumentContentFormat.TEXT, + output=[AnalyzeOutputOption.PDF], # request searchable PDF output + content_type="application/json", + ) - poller.wait() - result_id = poller.details["operation_id"] - result = poller.result() + poller.wait() + result_id = poller.details["operation_id"] + result = poller.result() - # Download the PDF with embedded text - self.archive_path = self.tempdir / "archive.pdf" - with self.archive_path.open("wb") as f: - for chunk in client.get_analyze_result_pdf( - model_id="prebuilt-read", - result_id=result_id, - ): - f.write(chunk) + # Download the PDF with embedded text + self.archive_path = self.tempdir / "archive.pdf" + with self.archive_path.open("wb") as f: + for chunk in client.get_analyze_result_pdf( + model_id="prebuilt-read", + result_id=result_id, + ): + f.write(chunk) + return result.content + except Exception as e: + self.log.error(f"Azure AI Vision parsing failed: {e}") + finally: + client.close() - client.close() - return result.content + return None def parse(self, document_path: Path, mime_type, file_name=None): if not self.settings.engine_is_valid(): diff --git a/src/paperless_remote/tests/test_parser.py b/src/paperless_remote/tests/test_parser.py index aa9df6117..793778ec3 100644 --- a/src/paperless_remote/tests/test_parser.py +++ b/src/paperless_remote/tests/test_parser.py @@ -68,6 +68,33 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): ["This is a test document."], ) + @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient") + def test_get_text_with_azure_error_logged_and_returns_none(self, mock_client_cls): + mock_client = mock.Mock() + mock_client.begin_analyze_document.side_effect = RuntimeError("fail") + mock_client_cls.return_value = mock_client + + with override_settings( + REMOTE_OCR_ENGINE="azureai", + REMOTE_OCR_API_KEY="somekey", + REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com", + ): + parser = get_parser(uuid.uuid4()) + with mock.patch.object(parser.log, "error") as mock_log_error: + parser.parse( + self.SAMPLE_FILES / "simple-digital.pdf", + "application/pdf", + ) + + self.assertIsNone(parser.text) + mock_client.begin_analyze_document.assert_called_once() + mock_client.close.assert_called_once() + mock_log_error.assert_called_once() + self.assertIn( + "Azure AI Vision parsing failed", + mock_log_error.call_args[0][0], + ) + @override_settings( REMOTE_OCR_ENGINE="azureai", REMOTE_OCR_API_KEY="key",