mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	Compare commits
	
		
			14 Commits
		
	
	
		
			dependabot
			...
			1002d37f6b
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					1002d37f6b | ||
| 
						 | 
					d260a94740 | ||
| 
						 | 
					88c69b83ea | ||
| 
						 | 
					2557ee2014 | ||
| 
						 | 
					3c75deed80 | ||
| 
						 | 
					d05343c927 | ||
| 
						 | 
					e7972b7eaf | ||
| 
						 | 
					75a091cc0d | ||
| 
						 | 
					dca74803fd | ||
| 
						 | 
					3cf3d868d0 | ||
| 
						 | 
					bf4fc6604a | ||
| 
						 | 
					e8c1eb86fa | ||
| 
						 | 
					c3dad3cf69 | ||
| 
						 | 
					811bd66088 | 
@@ -1759,3 +1759,23 @@ password. All of these options come from their similarly-named [Django settings]
 | 
			
		||||
#### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL}
 | 
			
		||||
 | 
			
		||||
: Defaults to false.
 | 
			
		||||
 | 
			
		||||
## Remote OCR
 | 
			
		||||
 | 
			
		||||
#### [`PAPERLESS_REMOTE_OCR_ENGINE=<str>`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE}
 | 
			
		||||
 | 
			
		||||
: The remote OCR engine to use. Currently only Azure AI is supported, selected with the value "azureai".
 | 
			
		||||
 | 
			
		||||
    Defaults to None, which disables remote OCR.
 | 
			
		||||
 | 
			
		||||
#### [`PAPERLESS_REMOTE_OCR_API_KEY=<str>`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY}
 | 
			
		||||
 | 
			
		||||
: The API key to use for the remote OCR engine. This is required for Azure AI.
 | 
			
		||||
 | 
			
		||||
    Defaults to None.
 | 
			
		||||
 | 
			
		||||
#### [`PAPERLESS_REMOTE_OCR_ENDPOINT=<str>`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT}
 | 
			
		||||
 | 
			
		||||
: The endpoint to use for the remote OCR engine. This is required for Azure AI.
 | 
			
		||||
 | 
			
		||||
    Defaults to None.
 | 
			
		||||
 
 | 
			
		||||
@@ -25,9 +25,10 @@ physical documents into a searchable online archive so you can keep, well, _less
 | 
			
		||||
## Features
 | 
			
		||||
 | 
			
		||||
-   **Organize and index** your scanned documents with tags, correspondents, types, and more.
 | 
			
		||||
-   _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way.
 | 
			
		||||
-   _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
 | 
			
		||||
-   Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
 | 
			
		||||
-   Utilizes the open-source Tesseract engine to recognize more than 100 languages.
 | 
			
		||||
    -   Utilizes the open-source Tesseract engine to recognize more than 100 languages.
 | 
			
		||||
    -   _New!_ Supports remote OCR with Azure AI (opt-in).
 | 
			
		||||
-   Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
 | 
			
		||||
-   Uses machine-learning to automatically add tags, correspondents and document types to your documents.
 | 
			
		||||
-   Supports PDF documents, images, plain text files, Office documents (Word, Excel, Powerpoint, and LibreOffice equivalents)[^1] and more.
 | 
			
		||||
 
 | 
			
		||||
@@ -841,6 +841,18 @@ how regularly you intend to scan documents and use paperless.
 | 
			
		||||
    performed the task associated with the document, move it to the
 | 
			
		||||
    inbox.
 | 
			
		||||
 | 
			
		||||
## Remote OCR
 | 
			
		||||
 | 
			
		||||
!!! important
 | 
			
		||||
 | 
			
		||||
    This feature is disabled by default and will always remain strictly "opt-in".
 | 
			
		||||
 | 
			
		||||
Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to
 | 
			
		||||
[Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence).
 | 
			
		||||
This is of course a paid service (with a free tier) which requires an Azure account and subscription. Azure AI is not affiliated with
 | 
			
		||||
Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing
 | 
			
		||||
the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details.
 | 
			
		||||
 | 
			
		||||
## Architecture
 | 
			
		||||
 | 
			
		||||
Paperless-ngx consists of the following components:
 | 
			
		||||
 
 | 
			
		||||
@@ -15,6 +15,7 @@ classifiers = [
 | 
			
		||||
# This will allow testing to not install a webserver, mysql, etc
 | 
			
		||||
 | 
			
		||||
dependencies = [
 | 
			
		||||
  "azure-ai-documentintelligence>=1.0.2",
 | 
			
		||||
  "bleach~=6.2.0",
 | 
			
		||||
  "celery[redis]~=5.5.1",
 | 
			
		||||
  "channels~=4.2",
 | 
			
		||||
@@ -239,6 +240,7 @@ testpaths = [
 | 
			
		||||
  "src/paperless_mail/tests/",
 | 
			
		||||
  "src/paperless_tesseract/tests/",
 | 
			
		||||
  "src/paperless_tika/tests",
 | 
			
		||||
  "src/paperless_remote/tests/",
 | 
			
		||||
]
 | 
			
		||||
addopts = [
 | 
			
		||||
  "--pythonwarnings=all",
 | 
			
		||||
 
 | 
			
		||||
@@ -324,6 +324,7 @@ INSTALLED_APPS = [
 | 
			
		||||
    "paperless_tesseract.apps.PaperlessTesseractConfig",
 | 
			
		||||
    "paperless_text.apps.PaperlessTextConfig",
 | 
			
		||||
    "paperless_mail.apps.PaperlessMailConfig",
 | 
			
		||||
    "paperless_remote.apps.PaperlessRemoteParserConfig",
 | 
			
		||||
    "django.contrib.admin",
 | 
			
		||||
    "rest_framework",
 | 
			
		||||
    "rest_framework.authtoken",
 | 
			
		||||
@@ -1409,3 +1410,11 @@ OUTLOOK_OAUTH_ENABLED = bool(
 | 
			
		||||
    and OUTLOOK_OAUTH_CLIENT_ID
 | 
			
		||||
    and OUTLOOK_OAUTH_CLIENT_SECRET,
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
###############################################################################
 | 
			
		||||
# Remote Parser                                                               #
 | 
			
		||||
###############################################################################
 | 
			
		||||
 | 
			
		||||
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
 | 
			
		||||
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
 | 
			
		||||
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										4
									
								
								src/paperless_remote/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								src/paperless_remote/__init__.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,4 @@
 | 
			
		||||
# this is here so that django finds the checks.
 | 
			
		||||
from paperless_remote.checks import check_remote_parser_configured
 | 
			
		||||
 | 
			
		||||
__all__ = ["check_remote_parser_configured"]
 | 
			
		||||
							
								
								
									
										14
									
								
								src/paperless_remote/apps.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								src/paperless_remote/apps.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,14 @@
 | 
			
		||||
from django.apps import AppConfig
 | 
			
		||||
 | 
			
		||||
from paperless_remote.signals import remote_consumer_declaration
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class PaperlessRemoteParserConfig(AppConfig):
    """Django application config for the ``paperless_remote`` app."""

    name = "paperless_remote"

    def ready(self):
        """Connect the remote parser declaration to the consumer signal."""
        # Kept as a function-level import, exactly like the original;
        # presumably `documents` must not be imported before apps are loaded.
        from documents.signals import document_consumer_declaration

        document_consumer_declaration.connect(remote_consumer_declaration)
        super().ready()
 | 
			
		||||
							
								
								
									
										15
									
								
								src/paperless_remote/checks.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										15
									
								
								src/paperless_remote/checks.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,15 @@
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from django.core.checks import Error
 | 
			
		||||
from django.core.checks import register
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@register()
def check_remote_parser_configured(app_configs, **kwargs):
    """
    System check: the Azure AI engine needs an endpoint.

    Returns a single ``Error`` when ``REMOTE_OCR_ENGINE`` is "azureai" and
    ``REMOTE_OCR_ENDPOINT`` is unset or empty; otherwise returns an empty list.
    """
    azure_selected = settings.REMOTE_OCR_ENGINE == "azureai"
    # Short-circuits so the endpoint is only looked at for the Azure engine.
    if not azure_selected or settings.REMOTE_OCR_ENDPOINT:
        return []

    problem = Error(
        "Azure AI remote parser requires endpoint to be configured.",
    )
    return [problem]
 | 
			
		||||
							
								
								
									
										113
									
								
								src/paperless_remote/parsers.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										113
									
								
								src/paperless_remote/parsers.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,113 @@
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
 | 
			
		||||
from paperless_tesseract.parsers import RasterisedDocumentParser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class RemoteEngineConfig:
 | 
			
		||||
    def __init__(
 | 
			
		||||
        self,
 | 
			
		||||
        engine: str,
 | 
			
		||||
        api_key: str | None = None,
 | 
			
		||||
        endpoint: str | None = None,
 | 
			
		||||
    ):
 | 
			
		||||
        self.engine = engine
 | 
			
		||||
        self.api_key = api_key
 | 
			
		||||
        self.endpoint = endpoint
 | 
			
		||||
 | 
			
		||||
    def engine_is_valid(self):
 | 
			
		||||
        valid = self.engine in ["azureai"] and self.api_key is not None
 | 
			
		||||
        if self.engine == "azureai":
 | 
			
		||||
            valid = valid and self.endpoint is not None
 | 
			
		||||
        return valid
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class RemoteDocumentParser(RasterisedDocumentParser):
    """
    Parses documents with a remote OCR engine instead of the local one.

    The only engine handled here is "azureai", which talks to Azure through
    the Document Intelligence client and requests a searchable PDF with
    embedded text alongside the extracted text.
    """

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Returns the configuration for the remote OCR engine, loaded from Django settings.
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )

    def supported_mime_types(self):
        """
        Returns the MIME types this parser handles.

        The list is empty when no valid remote engine is configured, so the
        parser is never selected in that case.
        """
        if self.settings.engine_is_valid():
            return [
                "application/pdf",
                "image/png",
                "image/jpeg",
                "image/tiff",
                "image/bmp",
                "image/gif",
                "image/webp",
            ]
        else:
            return []

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> str | None:
        """
        Sends the document to Azure for OCR and returns the extracted text.

        A searchable PDF with embedded text is requested as well. It is
        written to ``archive.pdf`` inside ``self.tempdir`` and
        ``self.archive_path`` is pointed at it only once the download has
        completed.

        Returns the text content on success. If the request, the polling or
        the PDF download fails, the error is logged and None is returned;
        ``self.archive_path`` is left untouched in that case.
        """
        # Imported lazily so the Azure SDK is only loaded when this engine is used.
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
        from azure.ai.documentintelligence.models import AnalyzeOutputOption
        from azure.ai.documentintelligence.models import DocumentContentFormat
        from azure.core.credentials import AzureKeyCredential

        client = DocumentIntelligenceClient(
            endpoint=self.settings.endpoint,
            credential=AzureKeyCredential(self.settings.api_key),
        )

        try:
            with file.open("rb") as f:
                analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())

            poller = client.begin_analyze_document(
                model_id="prebuilt-read",
                body=analyze_request,
                output_content_format=DocumentContentFormat.TEXT,
                output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
                content_type="application/json",
            )

            poller.wait()
            result_id = poller.details["operation_id"]
            result = poller.result()

            # Download the PDF with embedded text. The attribute is assigned
            # after the write so a failed download never leaves archive_path
            # pointing at a partial file.
            archive_path = Path(self.tempdir) / "archive.pdf"
            with archive_path.open("wb") as f:
                for chunk in client.get_analyze_result_pdf(
                    model_id="prebuilt-read",
                    result_id=result_id,
                ):
                    f.write(chunk)
            self.archive_path = archive_path

            return result.content
        except Exception as e:
            # Boundary with an external service: report and fall back to
            # "no text" instead of aborting the caller.
            self.log.error(f"Azure AI Vision parsing failed: {e}")
            return None
        finally:
            # Release the client's underlying connection in every case.
            client.close()

    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Populates ``self.text`` for the given document.

        With no valid engine configured a warning is logged and the text is
        set to an empty string. For the "azureai" engine the text is whatever
        ``azure_ai_vision_parse`` returns, which may be None on failure.
        """
        if not self.settings.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
            return

        if self.settings.engine == "azureai":
            self.text = self.azure_ai_vision_parse(document_path)
 | 
			
		||||
							
								
								
									
										18
									
								
								src/paperless_remote/signals.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								src/paperless_remote/signals.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,18 @@
 | 
			
		||||
def get_parser(*args, **kwargs):
    """Build a ``RemoteDocumentParser``, forwarding all arguments unchanged."""
    # Function-level import, as in the original, so the parser module is
    # only loaded when a parser is actually requested.
    from paperless_remote.parsers import RemoteDocumentParser

    parser = RemoteDocumentParser(*args, **kwargs)
    return parser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def get_supported_mime_types():
    """Return the MIME types reported by a throwaway parser instance."""
    from paperless_remote.parsers import RemoteDocumentParser

    # The instance is created with None as its only argument purely to query it.
    probe = RemoteDocumentParser(None)
    return probe.supported_mime_types()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def remote_consumer_declaration(sender, **kwargs):
    """
    Signal receiver describing the remote parser.

    Returns a dict with the parser factory, its weight and the MIME types it
    currently supports (evaluated at call time).
    """
    declaration = dict(
        parser=get_parser,
        weight=5,
        mime_types=get_supported_mime_types(),
    )
    return declaration
 | 
			
		||||
							
								
								
									
										0
									
								
								src/paperless_remote/tests/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								src/paperless_remote/tests/__init__.py
									
									
									
									
									
										Normal file
									
								
							
							
								
								
									
										
											BIN
										
									
								
								src/paperless_remote/tests/samples/simple-digital.pdf
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/paperless_remote/tests/samples/simple-digital.pdf
									
									
									
									
									
										Normal file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										29
									
								
								src/paperless_remote/tests/test_checks.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								src/paperless_remote/tests/test_checks.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,29 @@
 | 
			
		||||
from django.test import TestCase
 | 
			
		||||
from django.test import override_settings
 | 
			
		||||
 | 
			
		||||
from paperless_remote import check_remote_parser_configured
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestChecks(TestCase):
    """Tests for the remote parser system check."""

    @override_settings(REMOTE_OCR_ENGINE=None)
    def test_no_engine(self):
        # Remote OCR disabled: nothing to complain about.
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 0)

    @override_settings(
        REMOTE_OCR_ENGINE="azureai",
        REMOTE_OCR_API_KEY="somekey",
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_azure_no_endpoint(self):
        # Azure selected without an endpoint must produce exactly one error.
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 1)
        self.assertTrue(
            msgs[0].msg.startswith(
                "Azure AI remote parser requires endpoint to be configured.",
            ),
        )

    @override_settings(
        REMOTE_OCR_ENGINE="azureai",
        REMOTE_OCR_API_KEY="somekey",
        REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
    )
    def test_valid_configuration(self):
        # A fully configured Azure engine passes the check.
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 0)

    @override_settings(
        REMOTE_OCR_ENGINE="something",
        REMOTE_OCR_API_KEY="somekey",
    )
    def test_unknown_engine(self):
        # The check only concerns the Azure engine; other values are not flagged.
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 0)
 | 
			
		||||
							
								
								
									
										101
									
								
								src/paperless_remote/tests/test_parser.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										101
									
								
								src/paperless_remote/tests/test_parser.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,101 @@
 | 
			
		||||
import uuid
 | 
			
		||||
from pathlib import Path
 | 
			
		||||
from unittest import mock
 | 
			
		||||
 | 
			
		||||
from django.test import TestCase
 | 
			
		||||
from django.test import override_settings
 | 
			
		||||
 | 
			
		||||
from documents.tests.utils import DirectoriesMixin
 | 
			
		||||
from documents.tests.utils import FileSystemAssertsMixin
 | 
			
		||||
from paperless_remote.parsers import RemoteDocumentParser
 | 
			
		||||
from paperless_remote.signals import get_parser
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Tests for ``RemoteDocumentParser`` using a mocked Azure client."""

    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        # Asserts that all strings appear in content, in the given order.
        indices = []
        for s in strings:
            if s in content:
                indices.append(content.index(s))
            else:
                self.fail(f"'{s}' is not in '{content}'")
        self.assertListEqual(indices, sorted(indices))

    @mock.patch("paperless_tesseract.parsers.run_subprocess")
    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
    def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
        # Arrange mock Azure client
        mock_client = mock.Mock()
        mock_client_cls.return_value = mock_client

        # Simulate poller result and its `.details`
        mock_poller = mock.Mock()
        mock_poller.wait.return_value = None
        mock_poller.details = {"operation_id": "fake-op-id"}
        mock_client.begin_analyze_document.return_value = mock_poller
        mock_poller.result.return_value.content = "This is a test document."

        # Return dummy PDF bytes
        mock_client.get_analyze_result_pdf.return_value = [
            b"%PDF-",
            b"1.7 ",
            b"FAKEPDF",
        ]

        # Simulate pdftotext by writing dummy text to sidecar file
        def fake_run(cmd, *args, **kwargs):
            with Path(cmd[-1]).open("w", encoding="utf-8") as f:
                f.write("This is a test document.")

        mock_subprocess.side_effect = fake_run

        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
        ):
            parser = get_parser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

            # The searchable PDF chunks must have been written, in order,
            # to the archive file the parser points at.
            self.assertEqual(
                Path(parser.archive_path).read_bytes(),
                b"%PDF-1.7 FAKEPDF",
            )

    @override_settings(
        REMOTE_OCR_ENGINE="azureai",
        REMOTE_OCR_API_KEY="key",
        REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
    )
    def test_supported_mime_types_valid_config(self):
        parser = RemoteDocumentParser(uuid.uuid4())
        expected_types = [
            "application/pdf",
            "image/png",
            "image/jpeg",
            "image/tiff",
            "image/bmp",
            "image/gif",
            "image/webp",
        ]
        self.assertEqual(parser.supported_mime_types(), expected_types)

    @override_settings(
        REMOTE_OCR_ENGINE=None,
        REMOTE_OCR_API_KEY=None,
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_supported_mime_types_invalid_config(self):
        # Settings are pinned explicitly so the result does not depend on
        # PAPERLESS_REMOTE_OCR_* variables present in the test environment.
        parser = get_parser(uuid.uuid4())
        self.assertEqual(parser.supported_mime_types(), [])

    @override_settings(
        REMOTE_OCR_ENGINE=None,
        REMOTE_OCR_API_KEY=None,
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_parse_with_invalid_config(self):
        parser = get_parser(uuid.uuid4())
        parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
        self.assertEqual(parser.text, "")
 | 
			
		||||
							
								
								
									
										39
									
								
								uv.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										39
									
								
								uv.lock
									
									
									
										generated
									
									
									
								
							@@ -93,6 +93,34 @@ wheels = [
 | 
			
		||||
    { url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "azure-ai-documentintelligence"
 | 
			
		||||
version = "1.0.2"
 | 
			
		||||
source = { registry = "https://pypi.org/simple" }
 | 
			
		||||
dependencies = [
 | 
			
		||||
    { name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
]
 | 
			
		||||
sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 }
 | 
			
		||||
wheels = [
 | 
			
		||||
    { url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "azure-core"
 | 
			
		||||
version = "1.33.0"
 | 
			
		||||
source = { registry = "https://pypi.org/simple" }
 | 
			
		||||
dependencies = [
 | 
			
		||||
    { name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
]
 | 
			
		||||
sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 }
 | 
			
		||||
wheels = [
 | 
			
		||||
    { url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "babel"
 | 
			
		||||
version = "2.17.0"
 | 
			
		||||
@@ -1383,6 +1411,15 @@ wheels = [
 | 
			
		||||
    { url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "isodate"
 | 
			
		||||
version = "0.7.2"
 | 
			
		||||
source = { registry = "https://pypi.org/simple" }
 | 
			
		||||
sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 }
 | 
			
		||||
wheels = [
 | 
			
		||||
    { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 },
 | 
			
		||||
]
 | 
			
		||||
 | 
			
		||||
[[package]]
 | 
			
		||||
name = "jinja2"
 | 
			
		||||
version = "3.1.6"
 | 
			
		||||
@@ -1911,6 +1948,7 @@ name = "paperless-ngx"
 | 
			
		||||
version = "2.17.1"
 | 
			
		||||
source = { virtual = "." }
 | 
			
		||||
dependencies = [
 | 
			
		||||
    { name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
    { name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
 | 
			
		||||
@@ -2042,6 +2080,7 @@ typing = [
 | 
			
		||||
 | 
			
		||||
[package.metadata]
 | 
			
		||||
requires-dist = [
 | 
			
		||||
    { name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
 | 
			
		||||
    { name = "bleach", specifier = "~=6.2.0" },
 | 
			
		||||
    { name = "celery", extras = ["redis"], specifier = "~=5.5.1" },
 | 
			
		||||
    { name = "channels", specifier = "~=4.2" },
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user