mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-10 00:18:57 +00:00
Basic parse
This commit is contained in:
@@ -1,3 +1,5 @@
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
@@ -61,7 +63,41 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
||||
"""
|
||||
This method uses the Azure AI Document Intelligence API to parse documents
|
||||
"""
|
||||
# TODO: Implement the Azure AI Vision API parsing logic
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.core.credentials import AzureKeyCredential
|
||||
|
||||
client = DocumentIntelligenceClient(
|
||||
endpoint=self.settings.endpoint,
|
||||
credential=AzureKeyCredential(self.settings.api_key),
|
||||
)
|
||||
|
||||
with file.open("rb") as f:
|
||||
poller = client.begin_analyze_document(
|
||||
model_id="prebuilt-read",
|
||||
analyze_request=f,
|
||||
content_type="application/octet-stream",
|
||||
output_format="pdf",
|
||||
)
|
||||
|
||||
result = poller.result()
|
||||
|
||||
# Download the PDF with embedded text
|
||||
pdf_bytes = client.get_analyze_result_pdf(result.result_id)
|
||||
self.archive_path = Path(self.tempdir) / "archive.pdf"
|
||||
self.archive_path.write_bytes(pdf_bytes)
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp:
|
||||
subprocess.run(
|
||||
[
|
||||
"pdftotext",
|
||||
"-q",
|
||||
"-layout",
|
||||
str(self.archive_path),
|
||||
tmp.name,
|
||||
],
|
||||
)
|
||||
with Path.open(tmp.name, encoding="utf-8") as t:
|
||||
return t.read()
|
||||
|
||||
def parse(self, document_path: Path, mime_type, file_name=None):
|
||||
if not self.settings.engine_is_valid():
|
||||
|
@@ -75,7 +75,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
mock_azure_client.return_value.begin_analyze_document.return_value.result.return_value = result
|
||||
|
||||
with override_settings(
|
||||
REMOTE_OCR_ENGINE="azureaivision",
|
||||
REMOTE_OCR_ENGINE="azureai",
|
||||
REMOTE_OCR_API_KEY="somekey",
|
||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com/",
|
||||
):
|
||||
|
Reference in New Issue
Block a user