Use output_content_format and poller.result() to get clean content

This commit is contained in:
shamoon 2025-06-17 12:52:48 -07:00
parent 0fd6d40b37
commit d960aa2699
No known key found for this signature in database

View File

@@ -1,5 +1,3 @@
import subprocess
import tempfile
from pathlib import Path from pathlib import Path
from django.conf import settings from django.conf import settings
@@ -65,6 +63,8 @@ class RemoteDocumentParser(RasterisedDocumentParser):
""" """
from azure.ai.documentintelligence import DocumentIntelligenceClient from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.documentintelligence.models import AnalyzeOutputOption
from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient( client = DocumentIntelligenceClient(
@@ -77,12 +77,14 @@ class RemoteDocumentParser(RasterisedDocumentParser):
poller = client.begin_analyze_document( poller = client.begin_analyze_document(
model_id="prebuilt-read", model_id="prebuilt-read",
body=analyze_request, body=analyze_request,
output=["pdf"], # request searchable PDF output output_content_format=DocumentContentFormat.TEXT,
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
content_type="application/json", content_type="application/json",
) )
poller.wait() poller.wait()
result_id = poller.details["operation_id"] result_id = poller.details["operation_id"]
result = poller.result()
# Download the PDF with embedded text # Download the PDF with embedded text
self.archive_path = Path(self.tempdir) / "archive.pdf" self.archive_path = Path(self.tempdir) / "archive.pdf"
@@ -93,18 +95,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
): ):
f.write(chunk) f.write(chunk)
with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp: return result.content
subprocess.run(
[
"pdftotext",
"-q",
"-layout",
str(self.archive_path),
tmp.name,
],
)
with Path(tmp.name).open(encoding="utf-8") as t:
return t.read()
def parse(self, document_path: Path, mime_type, file_name=None): def parse(self, document_path: Path, mime_type, file_name=None):
if not self.settings.engine_is_valid(): if not self.settings.engine_is_valid():