mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-05-23 12:58:18 -05:00
118 lines
3.6 KiB
Python
118 lines
3.6 KiB
Python
import subprocess
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
from django.conf import settings
|
|
|
|
from paperless_tesseract.parsers import RasterisedDocumentParser
|
|
|
|
|
|
class RemoteEngineConfig:
|
|
def __init__(
|
|
self,
|
|
engine: str,
|
|
api_key: str | None = None,
|
|
endpoint: str | None = None,
|
|
):
|
|
self.engine = engine
|
|
self.api_key = api_key
|
|
self.endpoint = endpoint
|
|
|
|
def engine_is_valid(self):
|
|
valid = self.engine in ["azureai"] and self.api_key is not None
|
|
if self.engine == "azureai":
|
|
valid = valid and self.endpoint is not None
|
|
return valid
|
|
|
|
|
|
class RemoteDocumentParser(RasterisedDocumentParser):
    """
    This parser uses a remote OCR engine to parse documents.

    The engine and its credentials are read from the ``REMOTE_OCR_*`` Django
    settings. When no valid engine is configured, the parser advertises no
    MIME types and ``parse`` produces empty content.
    """

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Build the remote engine configuration from the Django settings
        ``REMOTE_OCR_ENGINE``, ``REMOTE_OCR_API_KEY`` and
        ``REMOTE_OCR_ENDPOINT``.
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )

    def supported_mime_types(self):
        """
        Return the MIME types this parser accepts.

        The list is empty when the configured engine is not valid, so the
        parser is never selected in that case.
        """
        if self.settings.engine_is_valid():
            return [
                "application/pdf",
                "image/png",
                "image/jpeg",
                "image/tiff",
                "image/bmp",
                "image/gif",
                "image/webp",
            ]
        else:
            return []

    def _pdf_to_text(self, pdf_path: Path) -> str:
        """
        Extract the text layer of *pdf_path* using ``pdftotext``.
        """
        # "-" makes pdftotext write to stdout, so no temporary text file is
        # left behind on disk after the document has been parsed.
        completed = subprocess.run(
            [
                "pdftotext",
                "-q",
                "-layout",
                str(pdf_path),
                "-",
            ],
            capture_output=True,
            encoding="utf-8",
            check=False,
        )
        if completed.returncode != 0:
            # Best effort: report the failure but still return what was read.
            self.log.warning(
                f"pdftotext exited with code {completed.returncode}, "
                "extracted text may be incomplete.",
            )
        return completed.stdout or ""

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> str | None:
        """
        Parse *file* with the Azure AI Document Intelligence API.

        The ``prebuilt-read`` model is asked for searchable PDF output. That
        PDF is written to ``archive.pdf`` inside ``self.tempdir`` and recorded
        in ``self.archive_path``; the text returned is extracted from it with
        ``pdftotext``.
        """
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
        from azure.core.credentials import AzureKeyCredential

        client = DocumentIntelligenceClient(
            endpoint=self.settings.endpoint,
            credential=AzureKeyCredential(self.settings.api_key),
        )

        try:
            with file.open("rb") as f:
                analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())

            poller = client.begin_analyze_document(
                model_id="prebuilt-read",
                body=analyze_request,
                output=["pdf"],  # request searchable PDF output
                content_type="application/json",
            )

            poller.wait()
            result_id = poller.details["operation_id"]

            # Download the PDF with embedded text
            self.archive_path = Path(self.tempdir) / "archive.pdf"
            with self.archive_path.open("wb") as f:
                for chunk in client.get_analyze_result_pdf(
                    model_id="prebuilt-read",
                    result_id=result_id,
                ):
                    f.write(chunk)
        finally:
            # Release the client's underlying connection even if the
            # analysis or the download fails.
            client.close()

        return self._pdf_to_text(self.archive_path)

    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Populate ``self.text`` for *document_path* using the configured
        remote engine.

        ``mime_type`` and ``file_name`` are accepted for interface
        compatibility but are not used here.
        """
        if not self.settings.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
            return

        if self.settings.engine == "azureai":
            self.text = self.azure_ai_vision_parse(document_path)
|