Use output_content_format poller.result to get clean content

This commit is contained in:
shamoon 2025-06-17 12:52:48 -07:00
parent 0fd6d40b37
commit d960aa2699
No known key found for this signature in database

View File

@ -1,5 +1,3 @@
import subprocess
import tempfile
from pathlib import Path
from django.conf import settings
@ -65,6 +63,8 @@ class RemoteDocumentParser(RasterisedDocumentParser):
"""
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.documentintelligence.models import AnalyzeOutputOption
from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient(
@ -77,12 +77,14 @@ class RemoteDocumentParser(RasterisedDocumentParser):
poller = client.begin_analyze_document(
model_id="prebuilt-read",
body=analyze_request,
output=["pdf"], # request searchable PDF output
output_content_format=DocumentContentFormat.TEXT,
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
content_type="application/json",
)
poller.wait()
result_id = poller.details["operation_id"]
result = poller.result()
# Download the PDF with embedded text
self.archive_path = Path(self.tempdir) / "archive.pdf"
@ -93,18 +95,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
):
f.write(chunk)
with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
subprocess.run(
[
"pdftotext",
"-q",
"-layout",
str(self.archive_path),
tmp.name,
],
)
with Path(tmp.name).open(encoding="utf-8") as t:
return t.read()
return result.content
def parse(self, document_path: Path, mime_type, file_name=None):
if not self.settings.engine_is_valid():