Compare commits

..

39 Commits

Author SHA1 Message Date
shamoon
8e0d574e99 Merge branch 'dev' into feature-remote-ocr-2 2025-09-21 16:18:13 -07:00
shamoon
4449dbadb5 Merge branch 'main' into dev 2025-09-21 16:10:00 -07:00
shamoon
0e35acaef5 Fix: add extra error handling to _consume for file checks (#10897) 2025-09-21 13:21:40 -07:00
shamoon
19ff339804 Fix: show children in tag list when filtering (#10899) 2025-09-21 10:09:05 -07:00
shamoon
8a5820328e Sonar suggestions 2025-09-17 19:18:47 -07:00
shamoon
809d62a2f4 Merge branch 'dev' into feature-remote-ocr-2 2025-09-17 16:51:23 -07:00
shamoon
0d87f94b9b Merge branch 'dev' into feature-remote-ocr-2 2025-09-14 14:01:35 -07:00
shamoon
315b90f8e5 Add typing to assertContainsStrings test util 2025-09-11 13:56:14 -07:00
shamoon
47b2d2964b Use regular testcase instead of django, config check test 2025-09-11 13:52:10 -07:00
shamoon
e05639ae4e tempdir already a path 2025-09-11 13:49:30 -07:00
shamoon
f400a8cb2f Close client 2025-09-11 13:49:06 -07:00
shamoon
26abcf5612 Also ensure API key is set 2025-09-11 13:48:06 -07:00
shamoon
afde52430d Merge branch 'dev' into feature-remote-ocr-2 2025-09-11 13:25:53 -07:00
shamoon
716f2da652 Merge branch 'dev' into feature-remote-ocr-2 2025-09-08 11:36:49 -07:00
shamoon
c54073b7c2 Merge branch 'dev' into feature-remote-ocr-2 2025-09-04 09:16:59 -07:00
shamoon
247e6f39dc Merge branch 'dev' into feature-remote-ocr-2 2025-09-01 20:10:40 -07:00
shamoon
1e6dfc4481 Merge branch 'dev' into feature-remote-ocr-2 2025-08-26 13:30:39 -07:00
shamoon
7cc0750066 Add note on costs and limitations for Azure OCR 2025-08-24 05:47:07 -07:00
shamoon
bd6585d3b4 Merge branch 'dev' into feature-remote-ocr-2 2025-08-22 08:54:26 -07:00
shamoon
717e828a1d Merge branch 'dev' into feature-remote-ocr-2 2025-08-17 21:25:14 -07:00
shamoon
07381d48e6 Merge branch 'dev' into feature-remote-ocr-2 2025-08-17 07:49:58 -07:00
shamoon
dd0ffaf312 Merge branch 'dev' into feature-remote-ocr-2 2025-08-11 10:48:36 -07:00
shamoon
264504affc Fix consumer declaration file extensions 2025-08-10 05:32:52 -07:00
shamoon
4feedf2add Merge branch 'dev' into feature-remote-ocr-2 2025-08-06 16:04:25 -04:00
shamoon
2f76cf9831 Merge branch 'dev' into feature-remote-ocr-2 2025-08-01 23:55:49 -04:00
shamoon
1002d37f6b Update test_parser.py 2025-07-09 11:05:37 -07:00
shamoon
d260a94740 Update parsers.py 2025-07-09 11:02:57 -07:00
shamoon
88c69b83ea Update index.md 2025-07-09 11:00:12 -07:00
shamoon
2557ee2014 Update docs to mention remote OCR with Azure AI 2025-07-09 09:53:30 -07:00
shamoon
3c75deed80 Add paperless_remote tests to testpaths 2025-07-08 14:19:45 -07:00
shamoon
d05343c927 Test fixes / coverage 2025-07-08 14:19:45 -07:00
shamoon
e7972b7eaf Coverage 2025-07-08 14:19:45 -07:00
shamoon
75a091cc0d Fix test 2025-07-08 14:19:44 -07:00
shamoon
dca74803fd Use output_content_format poller.result to get clean content 2025-07-08 14:19:44 -07:00
shamoon
3cf3d868d0 Some docs 2025-07-08 14:19:43 -07:00
shamoon
bf4fc6604a Test 2025-07-08 14:19:43 -07:00
shamoon
e8c1eb86fa This actually works
[ci skip]
2025-07-08 14:19:43 -07:00
shamoon
c3dad3cf69 Basic parse 2025-07-08 14:19:42 -07:00
shamoon
811bd66088 Ok, restart implementing this with just azure
[ci skip]
2025-07-08 14:19:42 -07:00
21 changed files with 437 additions and 195 deletions

View File

@@ -1759,6 +1759,11 @@ started by the container.
: Path to an image file in the /media/logo directory, must include 'logo', e.g. `/logo/Atari_logo.svg`
!!! note
The logo file will be viewable by anyone with access to the Paperless instance login page,
so consider your choice of logo carefully and remove EXIF data from images before uploading.
#### [`PAPERLESS_ENABLE_UPDATE_CHECK=<bool>`](#PAPERLESS_ENABLE_UPDATE_CHECK) {#PAPERLESS_ENABLE_UPDATE_CHECK}
!!! note
@@ -1800,3 +1805,23 @@ password. All of these options come from their similarly-named [Django settings]
#### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL}
: Defaults to false.
## Remote OCR
#### [`PAPERLESS_REMOTE_OCR_ENGINE=<str>`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE}
: The remote OCR engine to use. Currently only Azure AI is supported as "azureai".
Defaults to None, which disables remote OCR.
#### [`PAPERLESS_REMOTE_OCR_API_KEY=<str>`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY}
: The API key to use for the remote OCR engine.
Defaults to None.
#### [`PAPERLESS_REMOTE_OCR_ENDPOINT=<str>`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT}
: The endpoint to use for the remote OCR engine. This is required for Azure AI.
Defaults to None.
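For reference, a minimal environment configuration enabling Azure might look like the following; the endpoint and key values are placeholders for your own Azure resource:

```
PAPERLESS_REMOTE_OCR_ENGINE=azureai
PAPERLESS_REMOTE_OCR_API_KEY=<your-azure-api-key>
PAPERLESS_REMOTE_OCR_ENDPOINT=https://<your-resource>.cognitiveservices.azure.com
```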

View File

@@ -25,9 +25,10 @@ physical documents into a searchable online archive so you can keep, well, _less
## Features
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
- _New!_ Supports remote OCR with Azure AI (opt-in).
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, PowerPoint, and LibreOffice equivalents)[^1] and more.

View File

@@ -878,6 +878,21 @@ how regularly you intend to scan documents and use paperless.
performed the task associated with the document, move it to the
inbox.
## Remote OCR
!!! important
This feature is disabled by default and will always remain strictly "opt-in".
Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to
[Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence).
This is of course a paid service (with a free tier) which requires an Azure account and subscription. Azure AI is not affiliated with
Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing
the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details.
Additionally, when using a commercial service with this feature, consider both the potential costs and any associated file size or page limitations (e.g. with a free tier).
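For illustration, the condensed Python sketch below mirrors the round trip performed by the new `RemoteDocumentParser` (added in `paperless_remote/parsers.py` later in this compare): the document bytes are sent to the prebuilt "read" model, the plain text comes back from the poller result, and a searchable PDF is downloaded separately. Endpoint, key, and file names are placeholders; the real parser's settings plumbing and error handling are omitted.

```python
# Condensed illustration of the Azure Document Intelligence round trip used by
# RemoteDocumentParser. Endpoint, key, and file names are placeholders.
from pathlib import Path

from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
    AnalyzeDocumentRequest,
    AnalyzeOutputOption,
    DocumentContentFormat,
)
from azure.core.credentials import AzureKeyCredential

client = DocumentIntelligenceClient(
    endpoint="https://<your-resource>.cognitiveservices.azure.com",
    credential=AzureKeyCredential("<your-azure-api-key>"),
)

# Submit the document to the prebuilt "read" model, requesting plain text
# content plus a searchable PDF as additional output.
with Path("document.pdf").open("rb") as f:
    poller = client.begin_analyze_document(
        model_id="prebuilt-read",
        body=AnalyzeDocumentRequest(bytes_source=f.read()),
        output_content_format=DocumentContentFormat.TEXT,
        output=[AnalyzeOutputOption.PDF],
        content_type="application/json",
    )

result = poller.result()  # text content used for the search index
operation_id = poller.details["operation_id"]

# Download the PDF with the embedded text layer; the parser stores this as the
# archive version of the document.
with Path("archive.pdf").open("wb") as out:
    for chunk in client.get_analyze_result_pdf(
        model_id="prebuilt-read",
        result_id=operation_id,
    ):
        out.write(chunk)

client.close()
```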
## Architecture
Paperless-ngx consists of the following components:

View File

@@ -15,6 +15,7 @@ classifiers = [
# This will allow testing to not install a webserver, mysql, etc
dependencies = [
"azure-ai-documentintelligence>=1.0.2",
"babel>=2.17",
"bleach~=6.2.0",
"celery[redis]~=5.5.1",
@@ -233,6 +234,7 @@ testpaths = [
"src/paperless_tesseract/tests/", "src/paperless_tesseract/tests/",
"src/paperless_tika/tests", "src/paperless_tika/tests",
"src/paperless_text/tests/", "src/paperless_text/tests/",
"src/paperless_remote/tests/",
] ]
addopts = [ addopts = [
"--pythonwarnings=all", "--pythonwarnings=all",

View File

@@ -71,4 +71,20 @@ describe('TagListComponent', () => {
'Do you really want to delete the tag "Tag1"?'
)
})
it('should filter out child tags if name filter is empty, otherwise show all', () => {
const tags = [
{ id: 1, name: 'Tag1', parent: null },
{ id: 2, name: 'Tag2', parent: 1 },
{ id: 3, name: 'Tag3', parent: null },
]
component['_nameFilter'] = null // Simulate empty name filter
const filtered = component.filterData(tags as any)
expect(filtered.length).toBe(2)
expect(filtered.find((t) => t.id === 2)).toBeUndefined()
component['_nameFilter'] = 'Tag2' // Simulate non-empty name filter
const filteredWithName = component.filterData(tags as any)
expect(filteredWithName.length).toBe(3)
})
})

View File

@@ -62,6 +62,8 @@ export class TagListComponent extends ManagementListComponent<Tag> {
}
filterData(data: Tag[]) {
return this.nameFilter?.length
  ? [...data]
  : data.filter((tag) => !tag.parent)
}
}

View File

@@ -82,6 +82,13 @@ def _is_ignored(filepath: Path) -> bool:
def _consume(filepath: Path) -> None:
# Check permissions early
try:
filepath.stat()
except (PermissionError, OSError):
logger.warning(f"Not consuming file {filepath}: Permission denied.")
return
if filepath.is_dir() or _is_ignored(filepath):
return
@@ -323,7 +330,12 @@ class Command(BaseCommand):
# Also make sure the file exists still, some scanners might write a
# temporary file first
try:
file_still_exists = filepath.exists() and filepath.is_file()
except (PermissionError, OSError): # pragma: no cover
# If we can't check, let it fail in the _consume function
file_still_exists = True
continue
if waited_long_enough and file_still_exists:
_consume(filepath)

View File

@@ -1,6 +1,4 @@
import json
from fractions import Fraction
from io import BytesIO
from pathlib import Path
from django.contrib.auth.models import User
@@ -8,11 +6,6 @@ from django.core.files.uploadedfile import SimpleUploadedFile
from rest_framework import status
from rest_framework.test import APITestCase
try:
from PIL import Image
except ModuleNotFoundError: # pragma: no cover - Pillow is required in production
Image = None # type: ignore[assignment]
from documents.tests.utils import DirectoriesMixin
from paperless.models import ApplicationConfiguration
from paperless.models import ColorConvertChoices
@@ -197,74 +190,6 @@ class TestApiAppConfig(DirectoriesMixin, APITestCase):
)
self.assertFalse(Path(old_logo.path).exists())
def test_api_strips_metadata_from_logo_upload(self):
"""
GIVEN:
- An image file containing EXIF metadata including GPS coordinates
WHEN:
- Uploaded via PATCH to app config
THEN:
- Stored logo no longer contains EXIF metadata
"""
if Image is None:
self.skipTest("Pillow is not installed")
if not hasattr(Image, "Exif"):
self.skipTest("Current Pillow version cannot create EXIF metadata")
assert Image is not None
exif = Image.Exif()
exif[0x010E] = "Test description" # ImageDescription
exif[0x8825] = {
1: "N", # GPSLatitudeRef
2: (Fraction(51, 1), Fraction(30, 1), Fraction(0, 1)),
3: "E", # GPSLongitudeRef
4: (Fraction(0, 1), Fraction(7, 1), Fraction(0, 1)),
}
buffer = BytesIO()
Image.new("RGB", (8, 8), "white").save(buffer, format="JPEG", exif=exif)
buffer.seek(0)
with Image.open(BytesIO(buffer.getvalue())) as uploaded_image:
self.assertGreater(len(uploaded_image.getexif()), 0)
response = self.client.patch(
f"{self.ENDPOINT}1/",
{
"app_logo": SimpleUploadedFile(
name="with_exif.jpg",
content=buffer.getvalue(),
content_type="image/jpeg",
),
},
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
config = ApplicationConfiguration.objects.first()
stored_logo = Path(config.app_logo.path)
self.assertTrue(stored_logo.exists())
with Image.open(stored_logo) as sanitized:
sanitized_exif = sanitized.getexif()
self.assertNotEqual(sanitized_exif.get(0x010E), "Test description")
gps_ifd = None
if hasattr(sanitized_exif, "get_ifd"):
try:
gps_ifd = sanitized_exif.get_ifd(0x8825)
except KeyError:
gps_ifd = None
else:
gps_ifd = sanitized_exif.get(0x8825)
if gps_ifd is not None:
self.assertEqual(len(gps_ifd), 0, "GPS metadata should be cleared")
self.assertNotIn("exif", sanitized.info)
def test_api_rejects_malicious_svg_logo(self):
"""
GIVEN:

View File

@@ -209,6 +209,26 @@ class TestConsumer(DirectoriesMixin, ConsumerThreadMixin, TransactionTestCase):
# assert that we have an error logged with this invalid file.
error_logger.assert_called_once()
@mock.patch("documents.management.commands.document_consumer.logger.warning")
def test_permission_error_on_prechecks(self, warning_logger):
filepath = Path(self.dirs.consumption_dir) / "selinux.txt"
filepath.touch()
original_stat = Path.stat
def raising_stat(self, *args, **kwargs):
if self == filepath:
raise PermissionError("Permission denied")
return original_stat(self, *args, **kwargs)
with mock.patch("pathlib.Path.stat", new=raising_stat):
document_consumer._consume(filepath)
warning_logger.assert_called_once()
(args, _) = warning_logger.call_args
self.assertIn("Permission denied", args[0])
self.consume_file_mock.assert_not_called()
@override_settings(CONSUMPTION_DIR="does_not_exist")
def test_consumption_directory_invalid(self):
self.assertRaises(CommandError, call_command, "document_consumer", "--oneshot")

View File

@@ -1,5 +1,4 @@
import logging
from io import BytesIO
import magic
from allauth.mfa.adapter import get_adapter as get_mfa_adapter
@@ -10,10 +9,6 @@ from allauth.socialaccount.models import SocialApp
from django.contrib.auth.models import Group
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.core.files.uploadedfile import SimpleUploadedFile
from PIL import Image
from PIL import ImageOps
from PIL import UnidentifiedImageError
from rest_framework import serializers
from rest_framework.authtoken.serializers import AuthTokenSerializer
@@ -24,102 +19,6 @@ from paperless_mail.serialisers import ObfuscatedPasswordField
logger = logging.getLogger("paperless.settings")
def strip_image_metadata(uploaded_file, mime_type: str | None):
"""Return a copy of ``uploaded_file`` with EXIF/ICC metadata removed."""
if uploaded_file is None:
return uploaded_file
original_position = uploaded_file.tell() if hasattr(uploaded_file, "tell") else None
image = None
sanitized = None
try:
if hasattr(uploaded_file, "seek"):
uploaded_file.seek(0)
image = Image.open(uploaded_file)
image.load()
except (UnidentifiedImageError, OSError):
if hasattr(uploaded_file, "seek") and original_position is not None:
uploaded_file.seek(original_position)
return uploaded_file
try:
image_format = (image.format or "").upper()
image = ImageOps.exif_transpose(image)
if image_format not in {"JPEG", "JPG", "PNG"}:
if hasattr(uploaded_file, "seek") and original_position is not None:
uploaded_file.seek(original_position)
return uploaded_file
if hasattr(image, "info"):
image.info.pop("exif", None)
image.info.pop("icc_profile", None)
image.info.pop("comment", None)
if image_format in {"JPEG", "JPG"}:
sanitized = image.convert("RGB")
save_kwargs = {
"format": "JPEG",
"quality": 95,
"subsampling": 0,
"optimize": True,
"exif": b"",
}
else: # PNG
target_mode = (
"RGBA"
if ("A" in image.mode or image.info.get("transparency"))
else "RGB"
)
sanitized = image.convert(target_mode)
save_kwargs = {
"format": "PNG",
"optimize": True,
}
buffer = BytesIO()
try:
sanitized.save(buffer, **save_kwargs)
except (OSError, ValueError):
buffer = BytesIO()
if image_format in {"JPEG", "JPG"}:
sanitized.save(
buffer,
format="JPEG",
quality=90,
subsampling=0,
exif=b"",
)
else:
sanitized.save(
buffer,
format="PNG",
)
buffer.seek(0)
if hasattr(uploaded_file, "close"):
try:
uploaded_file.close()
except Exception:
pass
content_type = getattr(uploaded_file, "content_type", None) or mime_type
return SimpleUploadedFile(
name=getattr(uploaded_file, "name", "logo"),
content=buffer.getvalue(),
content_type=content_type,
)
finally:
if sanitized is not None:
sanitized.close()
if image is not None:
image.close()
class PaperlessAuthTokenSerializer(AuthTokenSerializer):
code = serializers.CharField(
label="MFA Code",
@@ -310,23 +209,10 @@ class ApplicationConfigurationSerializer(serializers.ModelSerializer):
return super().update(instance, validated_data)
def validate_app_logo(self, file):
if not file:
if file and magic.from_buffer(file.read(2048), mime=True) == "image/svg+xml":
return file
if hasattr(file, "seek"):
file.seek(0)
mime_type = magic.from_buffer(file.read(2048), mime=True)
if hasattr(file, "seek"):
file.seek(0)
if mime_type == "image/svg+xml":
reject_dangerous_svg(file)
if hasattr(file, "seek"):
file.seek(0)
return file
return strip_image_metadata(file, mime_type)
class Meta:
model = ApplicationConfiguration
fields = "__all__"

View File

@@ -322,6 +322,7 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig", "paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig", "paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig", "paperless_mail.apps.PaperlessMailConfig",
"paperless_remote.apps.PaperlessRemoteParserConfig",
"django.contrib.admin", "django.contrib.admin",
"rest_framework", "rest_framework",
"rest_framework.authtoken", "rest_framework.authtoken",
@@ -1389,3 +1390,10 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean(
"PAPERLESS_WEBHOOKS_ALLOW_INTERNAL_REQUESTS", "PAPERLESS_WEBHOOKS_ALLOW_INTERNAL_REQUESTS",
"true", "true",
) )
###############################################################################
# Remote Parser #
###############################################################################
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")

View File

@@ -0,0 +1,4 @@
# this is here so that django finds the checks.
from paperless_remote.checks import check_remote_parser_configured
__all__ = ["check_remote_parser_configured"]

View File

@@ -0,0 +1,14 @@
from django.apps import AppConfig
from paperless_remote.signals import remote_consumer_declaration
class PaperlessRemoteParserConfig(AppConfig):
name = "paperless_remote"
def ready(self):
from documents.signals import document_consumer_declaration
document_consumer_declaration.connect(remote_consumer_declaration)
AppConfig.ready(self)

View File

@@ -0,0 +1,17 @@
from django.conf import settings
from django.core.checks import Error
from django.core.checks import register
@register()
def check_remote_parser_configured(app_configs, **kwargs):
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
):
return [
Error(
"Azure AI remote parser requires endpoint and API key to be configured.",
),
]
return []
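For illustration only (output abbreviated and approximate): with `PAPERLESS_REMOTE_OCR_ENGINE=azureai` set but no endpoint or key configured, Django's system check framework reports the error above, roughly like so:

```
$ python manage.py check
SystemCheckError: System check identified some issues:

ERRORS:
?: Azure AI remote parser requires endpoint and API key to be configured.
```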

View File

@@ -0,0 +1,113 @@
from pathlib import Path
from django.conf import settings
from paperless_tesseract.parsers import RasterisedDocumentParser
class RemoteEngineConfig:
def __init__(
self,
engine: str,
api_key: str | None = None,
endpoint: str | None = None,
):
self.engine = engine
self.api_key = api_key
self.endpoint = endpoint
def engine_is_valid(self):
valid = self.engine in ["azureai"] and self.api_key is not None
if self.engine == "azureai":
valid = valid and self.endpoint is not None
return valid
class RemoteDocumentParser(RasterisedDocumentParser):
"""
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
as this is the only service that provides a remote OCR API with text-embedded PDF output.
"""
logging_name = "paperless.parsing.remote"
def get_settings(self) -> RemoteEngineConfig:
"""
Returns the configuration for the remote OCR engine, loaded from Django settings.
"""
return RemoteEngineConfig(
engine=settings.REMOTE_OCR_ENGINE,
api_key=settings.REMOTE_OCR_API_KEY,
endpoint=settings.REMOTE_OCR_ENDPOINT,
)
def supported_mime_types(self):
if self.settings.engine_is_valid():
return {
"application/pdf": ".pdf",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/tiff": ".tiff",
"image/bmp": ".bmp",
"image/gif": ".gif",
"image/webp": ".webp",
}
else:
return {}
def azure_ai_vision_parse(
self,
file: Path,
) -> str | None:
"""
Uses Azure AI Vision to parse the document and return the text content.
It requests a searchable PDF output with embedded text.
The PDF is saved to the archive_path attribute.
Returns the text content extracted from the document.
If the parsing fails, it returns None.
"""
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.documentintelligence.models import AnalyzeOutputOption
from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient(
endpoint=self.settings.endpoint,
credential=AzureKeyCredential(self.settings.api_key),
)
with file.open("rb") as f:
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
poller = client.begin_analyze_document(
model_id="prebuilt-read",
body=analyze_request,
output_content_format=DocumentContentFormat.TEXT,
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
content_type="application/json",
)
poller.wait()
result_id = poller.details["operation_id"]
result = poller.result()
# Download the PDF with embedded text
self.archive_path = self.tempdir / "archive.pdf"
with self.archive_path.open("wb") as f:
for chunk in client.get_analyze_result_pdf(
model_id="prebuilt-read",
result_id=result_id,
):
f.write(chunk)
client.close()
return result.content
def parse(self, document_path: Path, mime_type, file_name=None):
if not self.settings.engine_is_valid():
self.log.warning(
"No valid remote parser engine is configured, content will be empty.",
)
self.text = ""
elif self.settings.engine == "azureai":
self.text = self.azure_ai_vision_parse(document_path)

View File

@@ -0,0 +1,18 @@
def get_parser(*args, **kwargs):
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(*args, **kwargs)
def get_supported_mime_types():
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(None).supported_mime_types()
def remote_consumer_declaration(sender, **kwargs):
return {
"parser": get_parser,
"weight": 5,
"mime_types": get_supported_mime_types(),
}

View File

Binary file not shown.

View File

@@ -0,0 +1,24 @@
from unittest import TestCase
from django.test import override_settings
from paperless_remote import check_remote_parser_configured
class TestChecks(TestCase):
@override_settings(REMOTE_OCR_ENGINE=None)
def test_no_engine(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 0)
@override_settings(REMOTE_OCR_ENGINE="azureai")
@override_settings(REMOTE_OCR_API_KEY="somekey")
@override_settings(REMOTE_OCR_ENDPOINT=None)
def test_azure_no_endpoint(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 1)
self.assertTrue(
msgs[0].msg.startswith(
"Azure AI remote parser requires endpoint and API key to be configured.",
),
)

View File

@@ -0,0 +1,101 @@
import uuid
from pathlib import Path
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser
from paperless_remote.signals import get_parser
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
def assertContainsStrings(self, content: str, strings: list[str]):
# Asserts that all strings appear in content, in the given order.
indices = []
for s in strings:
if s in content:
indices.append(content.index(s))
else:
self.fail(f"'{s}' is not in '{content}'")
self.assertListEqual(indices, sorted(indices))
@mock.patch("paperless_tesseract.parsers.run_subprocess")
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
# Arrange mock Azure client
mock_client = mock.Mock()
mock_client_cls.return_value = mock_client
# Simulate poller result and its `.details`
mock_poller = mock.Mock()
mock_poller.wait.return_value = None
mock_poller.details = {"operation_id": "fake-op-id"}
mock_client.begin_analyze_document.return_value = mock_poller
mock_poller.result.return_value.content = "This is a test document."
# Return dummy PDF bytes
mock_client.get_analyze_result_pdf.return_value = [
b"%PDF-",
b"1.7 ",
b"FAKEPDF",
]
# Simulate pdftotext by writing dummy text to sidecar file
def fake_run(cmd, *args, **kwargs):
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
f.write("This is a test document.")
mock_subprocess.side_effect = fake_run
with override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="somekey",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
):
parser = get_parser(uuid.uuid4())
parser.parse(
self.SAMPLE_FILES / "simple-digital.pdf",
"application/pdf",
)
self.assertContainsStrings(
parser.text.strip(),
["This is a test document."],
)
@override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="key",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
)
def test_supported_mime_types_valid_config(self):
parser = RemoteDocumentParser(uuid.uuid4())
expected_types = {
"application/pdf": ".pdf",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/tiff": ".tiff",
"image/bmp": ".bmp",
"image/gif": ".gif",
"image/webp": ".webp",
}
self.assertEqual(parser.supported_mime_types(), expected_types)
def test_supported_mime_types_invalid_config(self):
parser = get_parser(uuid.uuid4())
self.assertEqual(parser.supported_mime_types(), {})
@override_settings(
REMOTE_OCR_ENGINE=None,
REMOTE_OCR_API_KEY=None,
REMOTE_OCR_ENDPOINT=None,
)
def test_parse_with_invalid_config(self):
parser = get_parser(uuid.uuid4())
parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
self.assertEqual(parser.text, "")

uv.lock (generated, 39 changed lines)
View File

@@ -95,6 +95,34 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" }, { url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" },
] ]
[[package]]
name = "azure-ai-documentintelligence"
version = "1.0.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 },
]
[[package]]
name = "azure-core"
version = "1.33.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 },
]
[[package]]
name = "babel"
version = "2.17.0"
@@ -1412,6 +1440,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" }, { url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" },
] ]
[[package]]
name = "isodate"
version = "0.7.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 },
]
[[package]]
name = "jinja2"
version = "3.1.6"
@@ -2032,6 +2069,7 @@ name = "paperless-ngx"
version = "2.18.4" version = "2.18.4"
source = { virtual = "." } source = { virtual = "." }
dependencies = [ dependencies = [
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2169,6 +2207,7 @@ typing = [
[package.metadata]
requires-dist = [
{ name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
{ name = "babel", specifier = ">=2.17" }, { name = "babel", specifier = ">=2.17" },
{ name = "bleach", specifier = "~=6.2.0" }, { name = "bleach", specifier = "~=6.2.0" },
{ name = "celery", extras = ["redis"], specifier = "~=5.5.1" }, { name = "celery", extras = ["redis"], specifier = "~=5.5.1" },