mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-16 17:25:11 -05:00
Compare commits
4 Commits
3c75deed80
...
1002d37f6b
Author | SHA1 | Date | |
---|---|---|---|
![]() |
1002d37f6b | ||
![]() |
d260a94740 | ||
![]() |
88c69b83ea | ||
![]() |
2557ee2014 |
@ -25,9 +25,10 @@ physical documents into a searchable online archive so you can keep, well, _less
|
||||
## Features
|
||||
|
||||
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
|
||||
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way.
|
||||
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
|
||||
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
|
||||
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
|
||||
- _New!_ Supports remote OCR with Azure AI (opt-in).
|
||||
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
|
||||
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
|
||||
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, Powerpoint, and LibreOffice equivalents)[^1] and more.
|
||||
|
@ -841,7 +841,7 @@ how regularly you intend to scan documents and use paperless.
|
||||
performed the task associated with the document, move it to the
|
||||
inbox.
|
||||
|
||||
## Remove OCR
|
||||
## Remote OCR
|
||||
|
||||
!!! important
|
||||
|
||||
|
@ -25,14 +25,15 @@ class RemoteEngineConfig:
|
||||
|
||||
class RemoteDocumentParser(RasterisedDocumentParser):
|
||||
"""
|
||||
This parser uses a remote ocr engine to parse documents
|
||||
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
|
||||
as this is the only service that provides a remote OCR API with text-embedded PDF output.
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.remote"
|
||||
|
||||
def get_settings(self) -> RemoteEngineConfig:
|
||||
"""
|
||||
This parser uses the OCR configuration settings to parse documents
|
||||
Returns the configuration for the remote OCR engine, loaded from Django settings.
|
||||
"""
|
||||
return RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
@ -59,7 +60,11 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
||||
file: Path,
|
||||
) -> str | None:
|
||||
"""
|
||||
This method uses the Azure AI Vision API to parse documents
|
||||
Uses Azure AI Vision to parse the document and return the text content.
|
||||
It requests a searchable PDF output with embedded text.
|
||||
The PDF is saved to the archive_path attribute.
|
||||
Returns the text content extracted from the document.
|
||||
If the parsing fails, it returns None.
|
||||
"""
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||
|
@ -88,11 +88,6 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
|
||||
def test_supported_mime_types_invalid_config(self):
|
||||
parser = get_parser(uuid.uuid4())
|
||||
# with override_settings(
|
||||
# REMOTE_OCR_ENGINE=None,
|
||||
# REMOTE_OCR_API_KEY=None,
|
||||
# REMOTE_OCR_ENDPOINT=None,
|
||||
# ):
|
||||
self.assertEqual(parser.supported_mime_types(), [])
|
||||
|
||||
@override_settings(
|
||||
|
Loading…
x
Reference in New Issue
Block a user