mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-06-20 15:17:32 -05:00
Use output_content_format and poller.result() to get clean content
This commit is contained in:
parent
0fd6d40b37
commit
d960aa2699
@ -1,5 +1,3 @@
|
|||||||
import subprocess
|
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
@ -65,6 +63,8 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
|||||||
"""
|
"""
|
||||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||||
|
from azure.ai.documentintelligence.models import AnalyzeOutputOption
|
||||||
|
from azure.ai.documentintelligence.models import DocumentContentFormat
|
||||||
from azure.core.credentials import AzureKeyCredential
|
from azure.core.credentials import AzureKeyCredential
|
||||||
|
|
||||||
client = DocumentIntelligenceClient(
|
client = DocumentIntelligenceClient(
|
||||||
@ -77,12 +77,14 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
|||||||
poller = client.begin_analyze_document(
|
poller = client.begin_analyze_document(
|
||||||
model_id="prebuilt-read",
|
model_id="prebuilt-read",
|
||||||
body=analyze_request,
|
body=analyze_request,
|
||||||
output=["pdf"], # request searchable PDF output
|
output_content_format=DocumentContentFormat.TEXT,
|
||||||
|
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
|
||||||
content_type="application/json",
|
content_type="application/json",
|
||||||
)
|
)
|
||||||
|
|
||||||
poller.wait()
|
poller.wait()
|
||||||
result_id = poller.details["operation_id"]
|
result_id = poller.details["operation_id"]
|
||||||
|
result = poller.result()
|
||||||
|
|
||||||
# Download the PDF with embedded text
|
# Download the PDF with embedded text
|
||||||
self.archive_path = Path(self.tempdir) / "archive.pdf"
|
self.archive_path = Path(self.tempdir) / "archive.pdf"
|
||||||
@ -93,18 +95,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
|||||||
):
|
):
|
||||||
f.write(chunk)
|
f.write(chunk)
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
|
return result.content
|
||||||
subprocess.run(
|
|
||||||
[
|
|
||||||
"pdftotext",
|
|
||||||
"-q",
|
|
||||||
"-layout",
|
|
||||||
str(self.archive_path),
|
|
||||||
tmp.name,
|
|
||||||
],
|
|
||||||
)
|
|
||||||
with Path(tmp.name).open(encoding="utf-8") as t:
|
|
||||||
return t.read()
|
|
||||||
|
|
||||||
def parse(self, document_path: Path, mime_type, file_name=None):
|
def parse(self, document_path: Path, mime_type, file_name=None):
|
||||||
if not self.settings.engine_is_valid():
|
if not self.settings.engine_is_valid():
|
||||||
|
Loading…
x
Reference in New Issue
Block a user