mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-16 17:25:11 -05:00
Compare commits
No commits in common. "1002d37f6b85733d0a3f8e3489b0ebdecbf049a9" and "3c75deed80a766ca18a8ae70c70769ed5e8ca8e2" have entirely different histories.
1002d37f6b
...
3c75deed80
@ -25,10 +25,9 @@ physical documents into a searchable online archive so you can keep, well, _less
|
||||
## Features
|
||||
|
||||
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
|
||||
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
|
||||
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way.
|
||||
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
|
||||
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
|
||||
- _New!_ Supports remote OCR with Azure AI (opt-in).
|
||||
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
|
||||
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
|
||||
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
|
||||
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, Powerpoint, and LibreOffice equivalents)[^1] and more.
|
||||
|
@ -841,7 +841,7 @@ how regularly you intend to scan documents and use paperless.
|
||||
performed the task associated with the document, move it to the
|
||||
inbox.
|
||||
|
||||
## Remote OCR
|
||||
## Remove OCR
|
||||
|
||||
!!! important
|
||||
|
||||
|
@ -25,15 +25,14 @@ class RemoteEngineConfig:
|
||||
|
||||
class RemoteDocumentParser(RasterisedDocumentParser):
|
||||
"""
|
||||
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
|
||||
as this is the only service that provides a remote OCR API with text-embedded PDF output.
|
||||
This parser uses a remote ocr engine to parse documents
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.remote"
|
||||
|
||||
def get_settings(self) -> RemoteEngineConfig:
|
||||
"""
|
||||
Returns the configuration for the remote OCR engine, loaded from Django settings.
|
||||
This parser uses the OCR configuration settings to parse documents
|
||||
"""
|
||||
return RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
@ -60,11 +59,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
||||
file: Path,
|
||||
) -> str | None:
|
||||
"""
|
||||
Uses Azure AI Vision to parse the document and return the text content.
|
||||
It requests a searchable PDF output with embedded text.
|
||||
The PDF is saved to the archive_path attribute.
|
||||
Returns the text content extracted from the document.
|
||||
If the parsing fails, it returns None.
|
||||
This method uses the Azure AI Vision API to parse documents
|
||||
"""
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||
|
@ -88,6 +88,11 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
|
||||
def test_supported_mime_types_invalid_config(self):
|
||||
parser = get_parser(uuid.uuid4())
|
||||
# with override_settings(
|
||||
# REMOTE_OCR_ENGINE=None,
|
||||
# REMOTE_OCR_API_KEY=None,
|
||||
# REMOTE_OCR_ENDPOINT=None,
|
||||
# ):
|
||||
self.assertEqual(parser.supported_mime_types(), [])
|
||||
|
||||
@override_settings(
|
||||
|
Loading…
x
Reference in New Issue
Block a user