mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-07-18 17:34:39 -05:00
Compare commits
No commits in common. "1002d37f6b85733d0a3f8e3489b0ebdecbf049a9" and "3c75deed80a766ca18a8ae70c70769ed5e8ca8e2" have entirely different histories.
1002d37f6b
...
3c75deed80
@ -25,10 +25,9 @@ physical documents into a searchable online archive so you can keep, well, _less
|
|||||||
## Features
|
## Features
|
||||||
|
|
||||||
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
|
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
|
||||||
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
|
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way.
|
||||||
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
|
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
|
||||||
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
|
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
|
||||||
- _New!_ Supports remote OCR with Azure AI (opt-in).
|
|
||||||
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
|
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
|
||||||
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
|
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
|
||||||
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, Powerpoint, and LibreOffice equivalents)[^1] and more.
|
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, Powerpoint, and LibreOffice equivalents)[^1] and more.
|
||||||
|
@ -841,7 +841,7 @@ how regularly you intend to scan documents and use paperless.
|
|||||||
performed the task associated with the document, move it to the
|
performed the task associated with the document, move it to the
|
||||||
inbox.
|
inbox.
|
||||||
|
|
||||||
## Remote OCR
|
## Remote OCR
|
||||||
|
|
||||||
!!! important
|
!!! important
|
||||||
|
|
||||||
|
@ -25,15 +25,14 @@ class RemoteEngineConfig:
|
|||||||
|
|
||||||
class RemoteDocumentParser(RasterisedDocumentParser):
|
class RemoteDocumentParser(RasterisedDocumentParser):
|
||||||
"""
|
"""
|
||||||
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
|
This parser uses a remote OCR engine to parse documents.
|
||||||
as this is the only service that provides a remote OCR API with text-embedded PDF output.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
logging_name = "paperless.parsing.remote"
|
logging_name = "paperless.parsing.remote"
|
||||||
|
|
||||||
def get_settings(self) -> RemoteEngineConfig:
|
def get_settings(self) -> RemoteEngineConfig:
|
||||||
"""
|
"""
|
||||||
Returns the configuration for the remote OCR engine, loaded from Django settings.
|
Returns the remote OCR engine configuration, built from the values in settings.
|
||||||
"""
|
"""
|
||||||
return RemoteEngineConfig(
|
return RemoteEngineConfig(
|
||||||
engine=settings.REMOTE_OCR_ENGINE,
|
engine=settings.REMOTE_OCR_ENGINE,
|
||||||
@ -60,11 +59,7 @@ class RemoteDocumentParser(RasterisedDocumentParser):
|
|||||||
file: Path,
|
file: Path,
|
||||||
) -> str | None:
|
) -> str | None:
|
||||||
"""
|
"""
|
||||||
Uses Azure AI Vision to parse the document and return the text content.
|
This method uses the Azure AI Vision API to parse documents
|
||||||
It requests a searchable PDF output with embedded text.
|
|
||||||
The PDF is saved to the archive_path attribute.
|
|
||||||
Returns the text content extracted from the document.
|
|
||||||
If the parsing fails, it returns None.
|
|
||||||
"""
|
"""
|
||||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||||
|
@ -88,6 +88,11 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|||||||
|
|
||||||
def test_supported_mime_types_invalid_config(self):
|
def test_supported_mime_types_invalid_config(self):
|
||||||
parser = get_parser(uuid.uuid4())
|
parser = get_parser(uuid.uuid4())
|
||||||
|
# with override_settings(
|
||||||
|
# REMOTE_OCR_ENGINE=None,
|
||||||
|
# REMOTE_OCR_API_KEY=None,
|
||||||
|
# REMOTE_OCR_ENDPOINT=None,
|
||||||
|
# ):
|
||||||
self.assertEqual(parser.supported_mime_types(), [])
|
self.assertEqual(parser.supported_mime_types(), [])
|
||||||
|
|
||||||
@override_settings(
|
@override_settings(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user