Use output_content_format and poller.result() to get clean content

2025-06-20 15:17:32 -05:00 · 2025-06-17 12:52:48 -07:00 · 2025-06-17 12:52:48 -07:00 · d960aa2699
commit d960aa2699
parent 0fd6d40b37
1 changed files with 6 additions and 15 deletions
--- a/src/paperless_remote/parsers.py
+++ b/src/paperless_remote/parsers.py
@@ -1,5 +1,3 @@
-import subprocess
-import tempfile
 from pathlib import Path
 from django.conf import settings
@@ -65,6 +63,8 @@ class RemoteDocumentParser(RasterisedDocumentParser):
        """
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
+        from azure.ai.documentintelligence.models import AnalyzeOutputOption
+        from azure.ai.documentintelligence.models import DocumentContentFormat
        from azure.core.credentials import AzureKeyCredential
        client = DocumentIntelligenceClient(
@@ -77,12 +77,14 @@ class RemoteDocumentParser(RasterisedDocumentParser):
            poller = client.begin_analyze_document(
                model_id="prebuilt-read",
                body=analyze_request,
-                output=["pdf"],  # request searchable PDF output
+                output_content_format=DocumentContentFormat.TEXT,
+                output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
                content_type="application/json",
            )
        poller.wait()
        result_id = poller.details["operation_id"]
+        result = poller.result()
        # Download the PDF with embedded text
        self.archive_path = Path(self.tempdir) / "archive.pdf"
@@ -93,18 +95,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
            ):
                f.write(chunk)
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
+        return result.content
-            subprocess.run(
-                [
-                    "pdftotext",
-                    "-q",
-                    "-layout",
-                    str(self.archive_path),
-                    tmp.name,
-                ],
-            )
-            with Path(tmp.name).open(encoding="utf-8") as t:
-                return t.read()
    def parse(self, document_path: Path, mime_type, file_name=None):
        if not self.settings.engine_is_valid():