diff --git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py index 2027ea8e6..1004ead3f 100644 --- a/src/paperless_remote/parsers.py +++ b/src/paperless_remote/parsers.py @@ -1,5 +1,3 @@ -import subprocess -import tempfile from pathlib import Path from django.conf import settings @@ -65,6 +63,8 @@ class RemoteDocumentParser(RasterisedDocumentParser): """ from azure.ai.documentintelligence import DocumentIntelligenceClient from azure.ai.documentintelligence.models import AnalyzeDocumentRequest + from azure.ai.documentintelligence.models import AnalyzeOutputOption + from azure.ai.documentintelligence.models import DocumentContentFormat from azure.core.credentials import AzureKeyCredential client = DocumentIntelligenceClient( @@ -77,12 +77,14 @@ class RemoteDocumentParser(RasterisedDocumentParser): poller = client.begin_analyze_document( model_id="prebuilt-read", body=analyze_request, - output=["pdf"], # request searchable PDF output + output_content_format=DocumentContentFormat.TEXT, + output=[AnalyzeOutputOption.PDF], # request searchable PDF output content_type="application/json", ) poller.wait() result_id = poller.details["operation_id"] + result = poller.result() # Download the PDF with embedded text self.archive_path = Path(self.tempdir) / "archive.pdf" @@ -93,18 +95,7 @@ class RemoteDocumentParser(RasterisedDocumentParser): ): f.write(chunk) - with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp: - subprocess.run( - [ - "pdftotext", - "-q", - "-layout", - str(self.archive_path), - tmp.name, - ], - ) - with Path(tmp.name).open(encoding="utf-8") as t: - return t.read() + return result.content def parse(self, document_path: Path, mime_type, file_name=None): if not self.settings.engine_is_valid():