mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-16 17:25:11 -05:00
Compare commits
4 Commits
3c75deed80
...
1002d37f6b
Author | SHA1 | Date | |
---|---|---|---|
![]() |
1002d37f6b | ||
![]() |
d260a94740 | ||
![]() |
88c69b83ea | ||
![]() |
2557ee2014 |
@ -25,9 +25,10 @@ physical documents into a searchable online archive so you can keep, well, _less
|
|||||||
## Features
|
## Features
|
||||||
|
|
||||||
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
|
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
|
||||||
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way.
|
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
|
||||||
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
|
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
|
||||||
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
|
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
|
||||||
|
- _New!_ Supports remote OCR with Azure AI (opt-in).
|
||||||
- Documents are saved as PDF/A format which is designed for long-term storage, alongside the unaltered originals.
|
- Documents are saved as PDF/A format which is designed for long-term storage, alongside the unaltered originals.
|
||||||
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
|
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
|
||||||
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, PowerPoint, and LibreOffice equivalents)[^1] and more.
|
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, PowerPoint, and LibreOffice equivalents)[^1] and more.
|
||||||
|
@ -841,7 +841,7 @@ how regularly you intend to scan documents and use paperless.
|
|||||||
performed the task associated with the document, move it to the
|
performed the task associated with the document, move it to the
|
||||||
inbox.
|
inbox.
|
||||||
|
|
||||||
## Remove OCR
|
## Remote OCR
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
|
|
||||||
|
@ -25,14 +25,15 @@ class RemoteEngineConfig:
|
|||||||
|
|
||||||
class RemoteDocumentParser(RasterisedDocumentParser):
|
class RemoteDocumentParser(RasterisedDocumentParser):
|
||||||
"""
|
"""
|
||||||
This parser uses a remote ocr engine to parse documents
|
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
|
||||||
|
as this is the only service that provides a remote OCR API with text-embedded PDF output.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
logging_name = "paperless.parsing.remote"
|
logging_name = "paperless.parsing.remote"
|
||||||
|
|
||||||
def get_settings(self) -> RemoteEngineConfig:
|
def get_settings(self) -> RemoteEngineConfig:
|
||||||
"""
|
"""
|
||||||
This parser uses the OCR configuration settings to parse documents
|
Returns the configuration for the remote OCR engine, loaded from Django settings.
|
||||||
"""
|
"""
|
||||||
return RemoteEngineConfig(
|
return RemoteEngineConfig(
|
||||||
engine=settings.REMOTE_OCR_ENGINE,
|
engine=settings.REMOTE_OCR_ENGINE,
|
||||||
@ -59,7 +60,11 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
|||||||
file: Path,
|
file: Path,
|
||||||
) -> str | None:
|
) -> str | None:
|
||||||
"""
|
"""
|
||||||
This method uses the Azure AI Vision API to parse documents
|
Uses Azure AI Vision to parse the document and return the text content.
|
||||||
|
It requests a searchable PDF output with embedded text.
|
||||||
|
The PDF is saved to the archive_path attribute.
|
||||||
|
Returns the text content extracted from the document.
|
||||||
|
If the parsing fails, it returns None.
|
||||||
"""
|
"""
|
||||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||||
|
@ -88,11 +88,6 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
|
|
||||||
def test_supported_mime_types_invalid_config(self):
|
def test_supported_mime_types_invalid_config(self):
|
||||||
parser = get_parser(uuid.uuid4())
|
parser = get_parser(uuid.uuid4())
|
||||||
# with override_settings(
|
|
||||||
# REMOTE_OCR_ENGINE=None,
|
|
||||||
# REMOTE_OCR_API_KEY=None,
|
|
||||||
# REMOTE_OCR_ENDPOINT=None,
|
|
||||||
# ):
|
|
||||||
self.assertEqual(parser.supported_mime_types(), [])
|
self.assertEqual(parser.supported_mime_types(), [])
|
||||||
|
|
||||||
@override_settings(
|
@override_settings(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user