This actually works

[ci skip]
This commit is contained in:
shamoon 2025-04-18 13:03:51 -07:00
parent b6f39b453b
commit 18e77fabf5
No known key found for this signature in database
2 changed files with 15 additions and 7 deletions

View File

@ -317,6 +317,7 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig", "paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig", "paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig", "paperless_mail.apps.PaperlessMailConfig",
"paperless_remote.apps.PaperlessRemoteParserConfig",
"django.contrib.admin", "django.contrib.admin",
"rest_framework", "rest_framework",
"rest_framework.authtoken", "rest_framework.authtoken",

View File

@ -64,6 +64,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
This method uses the Azure AI Document Intelligence API to parse documents This method uses the Azure AI Document Intelligence API to parse documents
""" """
from azure.ai.documentintelligence import DocumentIntelligenceClient from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.core.credentials import AzureKeyCredential from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient( client = DocumentIntelligenceClient(
@ -72,19 +73,25 @@ class RemoteDocumentParser(RasterisedDocumentParser):
) )
with file.open("rb") as f: with file.open("rb") as f:
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
poller = client.begin_analyze_document( poller = client.begin_analyze_document(
model_id="prebuilt-read", model_id="prebuilt-read",
analyze_request=f, body=analyze_request,
content_type="application/octet-stream", output=["pdf"], # request searchable PDF output
output_format="pdf", content_type="application/json",
) )
result = poller.result() poller.wait()
result_id = poller.details["operation_id"]
# Download the PDF with embedded text # Download the PDF with embedded text
pdf_bytes = client.get_analyze_result_pdf(result.result_id)
self.archive_path = Path(self.tempdir) / "archive.pdf" self.archive_path = Path(self.tempdir) / "archive.pdf"
self.archive_path.write_bytes(pdf_bytes) with self.archive_path.open("wb") as f:
for chunk in client.get_analyze_result_pdf(
model_id="prebuilt-read",
result_id=result_id,
):
f.write(chunk)
with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp: with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
subprocess.run( subprocess.run(
@ -96,7 +103,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
tmp.name, tmp.name,
], ],
) )
with Path.open(tmp.name, encoding="utf-8") as t: with Path(tmp.name).open(encoding="utf-8") as t:
return t.read() return t.read()
def parse(self, document_path: Path, mime_type, file_name=None): def parse(self, document_path: Path, mime_type, file_name=None):