Merge branch 'dev' into feature-ai

This commit is contained in:
shamoon
2026-01-08 13:59:51 -08:00
47 changed files with 1582 additions and 196 deletions

View File

@@ -646,6 +646,77 @@ def edit_pdf(
return "OK"
def remove_password(
doc_ids: list[int],
password: str,
*,
update_document: bool = False,
delete_original: bool = False,
include_metadata: bool = True,
user: User | None = None,
) -> Literal["OK"]:
"""
Remove password protection from PDF documents.
"""
import pikepdf
for doc_id in doc_ids:
doc = Document.objects.get(id=doc_id)
try:
logger.info(
f"Attempting password removal from document {doc_ids[0]}",
)
with pikepdf.open(doc.source_path, password=password) as pdf:
temp_path = doc.source_path.with_suffix(".tmp.pdf")
pdf.remove_unreferenced_resources()
pdf.save(temp_path)
if update_document:
# replace the original document with the unprotected one
temp_path.replace(doc.source_path)
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
doc.page_count = len(pdf.pages)
doc.save()
update_document_content_maybe_archive_file.delay(document_id=doc.id)
else:
consume_tasks = []
overrides = (
DocumentMetadataOverrides().from_document(doc)
if include_metadata
else DocumentMetadataOverrides()
)
if user is not None:
overrides.owner_id = user.id
filepath: Path = (
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
/ f"{doc.id}_unprotected.pdf"
)
temp_path.replace(filepath)
consume_tasks.append(
consume_file.s(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=filepath,
),
overrides,
),
)
if delete_original:
chord(header=consume_tasks, body=delete.si([doc.id])).delay()
else:
group(consume_tasks).delay()
except Exception as e:
logger.exception(f"Error removing password from document {doc.id}: {e}")
raise ValueError(
f"An error occurred while removing the password: {e}",
) from e
return "OK"
def reflect_doclinks(
document: Document,
field: CustomField,

View File

@@ -10,6 +10,7 @@ from datetime import time
from datetime import timedelta
from datetime import timezone
from shutil import rmtree
from time import sleep
from typing import TYPE_CHECKING
from typing import Literal
@@ -32,6 +33,7 @@ from whoosh.highlight import HtmlFormatter
from whoosh.idsets import BitSet
from whoosh.idsets import DocIdSet
from whoosh.index import FileIndex
from whoosh.index import LockError
from whoosh.index import create_in
from whoosh.index import exists_in
from whoosh.index import open_dir
@@ -97,11 +99,33 @@ def get_schema() -> Schema:
def open_index(*, recreate=False) -> FileIndex:
try:
if exists_in(settings.INDEX_DIR) and not recreate:
return open_dir(settings.INDEX_DIR, schema=get_schema())
except Exception:
logger.exception("Error while opening the index, recreating.")
transient_exceptions = (FileNotFoundError, LockError)
max_retries = 3
retry_delay = 0.1
for attempt in range(max_retries + 1):
try:
if exists_in(settings.INDEX_DIR) and not recreate:
return open_dir(settings.INDEX_DIR, schema=get_schema())
break
except transient_exceptions as exc:
is_last_attempt = attempt == max_retries or recreate
if is_last_attempt:
logger.exception(
"Error while opening the index after retries, recreating.",
)
break
logger.warning(
"Transient error while opening the index (attempt %s/%s): %s. Retrying.",
attempt + 1,
max_retries + 1,
exc,
)
sleep(retry_delay)
except Exception:
logger.exception("Error while opening the index, recreating.")
break
# create_in doesn't handle corrupted indexes very well, remove the directory entirely first
if settings.INDEX_DIR.is_dir():

View File

@@ -1430,6 +1430,7 @@ class BulkEditSerializer(
"split",
"delete_pages",
"edit_pdf",
"remove_password",
],
label="Method",
write_only=True,
@@ -1505,6 +1506,8 @@ class BulkEditSerializer(
return bulk_edit.delete_pages
elif method == "edit_pdf":
return bulk_edit.edit_pdf
elif method == "remove_password":
return bulk_edit.remove_password
else: # pragma: no cover
# This will never happen as it is handled by the ChoiceField
raise serializers.ValidationError("Unsupported method.")
@@ -1701,6 +1704,12 @@ class BulkEditSerializer(
f"Page {op['page']} is out of bounds for document with {doc.page_count} pages.",
)
def validate_parameters_remove_password(self, parameters):
if "password" not in parameters:
raise serializers.ValidationError("password not specified")
if not isinstance(parameters["password"], str):
raise serializers.ValidationError("password must be a string")
def validate(self, attrs):
method = attrs["method"]
parameters = attrs["parameters"]
@@ -1741,6 +1750,8 @@ class BulkEditSerializer(
"Edit PDF method only supports one document",
)
self._validate_parameters_edit_pdf(parameters, attrs["documents"][0])
elif method == bulk_edit.remove_password:
self.validate_parameters_remove_password(parameters)
return attrs

View File

@@ -508,7 +508,7 @@ def check_scheduled_workflows():
trigger.schedule_is_recurring
and workflow_runs.exists()
and (
workflow_runs.last().run_at
workflow_runs.first().run_at
> now
- datetime.timedelta(
days=trigger.schedule_recurring_interval_days,

View File

@@ -1582,6 +1582,58 @@ class TestBulkEditAPI(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
self.assertIn(b"out of bounds", response.content)
@mock.patch("documents.serialisers.bulk_edit.remove_password")
def test_remove_password(self, m):
self.setup_mock(m, "remove_password")
response = self.client.post(
"/api/documents/bulk_edit/",
json.dumps(
{
"documents": [self.doc2.id],
"method": "remove_password",
"parameters": {"password": "secret", "update_document": True},
},
),
content_type="application/json",
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
m.assert_called_once()
args, kwargs = m.call_args
self.assertCountEqual(args[0], [self.doc2.id])
self.assertEqual(kwargs["password"], "secret")
self.assertTrue(kwargs["update_document"])
self.assertEqual(kwargs["user"], self.user)
def test_remove_password_invalid_params(self):
response = self.client.post(
"/api/documents/bulk_edit/",
json.dumps(
{
"documents": [self.doc2.id],
"method": "remove_password",
"parameters": {},
},
),
content_type="application/json",
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
self.assertIn(b"password not specified", response.content)
response = self.client.post(
"/api/documents/bulk_edit/",
json.dumps(
{
"documents": [self.doc2.id],
"method": "remove_password",
"parameters": {"password": 123},
},
),
content_type="application/json",
)
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
self.assertIn(b"password must be a string", response.content)
@override_settings(AUDIT_LOG_ENABLED=True)
def test_bulk_edit_audit_log_enabled_simple_field(self):
"""

View File

@@ -1,3 +1,4 @@
import hashlib
import shutil
from datetime import date
from pathlib import Path
@@ -1066,3 +1067,147 @@ class TestPDFActions(DirectoriesMixin, TestCase):
bulk_edit.edit_pdf(doc_ids, operations, update_document=True)
mock_group.assert_not_called()
mock_consume_file.assert_not_called()
@mock.patch("documents.bulk_edit.update_document_content_maybe_archive_file.delay")
@mock.patch("pikepdf.open")
def test_remove_password_update_document(self, mock_open, mock_update_document):
doc = self.doc1
original_checksum = doc.checksum
fake_pdf = mock.MagicMock()
fake_pdf.pages = [mock.Mock(), mock.Mock(), mock.Mock()]
def save_side_effect(target_path):
Path(target_path).write_bytes(b"new pdf content")
fake_pdf.save.side_effect = save_side_effect
mock_open.return_value.__enter__.return_value = fake_pdf
result = bulk_edit.remove_password(
[doc.id],
password="secret",
update_document=True,
)
self.assertEqual(result, "OK")
mock_open.assert_called_once_with(doc.source_path, password="secret")
fake_pdf.remove_unreferenced_resources.assert_called_once()
doc.refresh_from_db()
self.assertNotEqual(doc.checksum, original_checksum)
expected_checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
self.assertEqual(doc.checksum, expected_checksum)
self.assertEqual(doc.page_count, len(fake_pdf.pages))
mock_update_document.assert_called_once_with(document_id=doc.id)
@mock.patch("documents.bulk_edit.chord")
@mock.patch("documents.bulk_edit.group")
@mock.patch("documents.tasks.consume_file.s")
@mock.patch("documents.bulk_edit.tempfile.mkdtemp")
@mock.patch("pikepdf.open")
def test_remove_password_creates_consumable_document(
self,
mock_open,
mock_mkdtemp,
mock_consume_file,
mock_group,
mock_chord,
):
doc = self.doc2
temp_dir = self.dirs.scratch_dir / "remove-password"
temp_dir.mkdir(parents=True, exist_ok=True)
mock_mkdtemp.return_value = str(temp_dir)
fake_pdf = mock.MagicMock()
fake_pdf.pages = [mock.Mock(), mock.Mock()]
def save_side_effect(target_path):
Path(target_path).write_bytes(b"password removed")
fake_pdf.save.side_effect = save_side_effect
mock_open.return_value.__enter__.return_value = fake_pdf
mock_group.return_value.delay.return_value = None
user = User.objects.create(username="owner")
result = bulk_edit.remove_password(
[doc.id],
password="secret",
include_metadata=False,
update_document=False,
delete_original=False,
user=user,
)
self.assertEqual(result, "OK")
mock_open.assert_called_once_with(doc.source_path, password="secret")
mock_consume_file.assert_called_once()
consume_args, _ = mock_consume_file.call_args
consumable_document = consume_args[0]
overrides = consume_args[1]
expected_path = temp_dir / f"{doc.id}_unprotected.pdf"
self.assertTrue(expected_path.exists())
self.assertEqual(
Path(consumable_document.original_file).resolve(),
expected_path.resolve(),
)
self.assertEqual(overrides.owner_id, user.id)
mock_group.assert_called_once_with([mock_consume_file.return_value])
mock_group.return_value.delay.assert_called_once()
mock_chord.assert_not_called()
@mock.patch("documents.bulk_edit.delete")
@mock.patch("documents.bulk_edit.chord")
@mock.patch("documents.bulk_edit.group")
@mock.patch("documents.tasks.consume_file.s")
@mock.patch("documents.bulk_edit.tempfile.mkdtemp")
@mock.patch("pikepdf.open")
def test_remove_password_deletes_original(
self,
mock_open,
mock_mkdtemp,
mock_consume_file,
mock_group,
mock_chord,
mock_delete,
):
doc = self.doc2
temp_dir = self.dirs.scratch_dir / "remove-password-delete"
temp_dir.mkdir(parents=True, exist_ok=True)
mock_mkdtemp.return_value = str(temp_dir)
fake_pdf = mock.MagicMock()
fake_pdf.pages = [mock.Mock(), mock.Mock()]
def save_side_effect(target_path):
Path(target_path).write_bytes(b"password removed")
fake_pdf.save.side_effect = save_side_effect
mock_open.return_value.__enter__.return_value = fake_pdf
mock_chord.return_value.delay.return_value = None
result = bulk_edit.remove_password(
[doc.id],
password="secret",
include_metadata=False,
update_document=False,
delete_original=True,
)
self.assertEqual(result, "OK")
mock_open.assert_called_once_with(doc.source_path, password="secret")
mock_consume_file.assert_called_once()
mock_group.assert_not_called()
mock_chord.assert_called_once()
mock_chord.return_value.delay.assert_called_once()
mock_delete.si.assert_called_once_with([doc.id])
@mock.patch("pikepdf.open")
def test_remove_password_open_failure(self, mock_open):
mock_open.side_effect = RuntimeError("wrong password")
with self.assertLogs("paperless.bulk_edit", level="ERROR") as cm:
with self.assertRaises(ValueError) as exc:
bulk_edit.remove_password([self.doc1.id], password="secret")
self.assertIn("wrong password", str(exc.exception))
self.assertIn("Error removing password from document", cm.output[0])

View File

@@ -1,6 +1,7 @@
from datetime import datetime
from unittest import mock
from django.conf import settings
from django.contrib.auth.models import User
from django.test import SimpleTestCase
from django.test import TestCase
@@ -251,3 +252,120 @@ class TestRewriteNaturalDateKeywords(SimpleTestCase):
result = self._rewrite_with_now("added:today", fixed_now)
# Should convert to UTC properly
self.assertIn("added:[20250719", result)
class TestIndexResilience(DirectoriesMixin, SimpleTestCase):
def _assert_recreate_called(self, mock_create_in):
mock_create_in.assert_called_once()
path_arg, schema_arg = mock_create_in.call_args.args
self.assertEqual(path_arg, settings.INDEX_DIR)
self.assertEqual(schema_arg.__class__.__name__, "Schema")
def test_transient_missing_segment_does_not_force_recreate(self):
"""
GIVEN:
- Index directory exists
WHEN:
- open_index is called
- Opening the index raises FileNotFoundError once due to a
transient missing segment
THEN:
- Index is opened successfully on retry
- Index is not recreated
"""
file_marker = settings.INDEX_DIR / "file_marker.txt"
file_marker.write_text("keep")
expected_index = object()
with (
mock.patch("documents.index.exists_in", return_value=True),
mock.patch(
"documents.index.open_dir",
side_effect=[FileNotFoundError("missing"), expected_index],
) as mock_open_dir,
mock.patch(
"documents.index.create_in",
) as mock_create_in,
mock.patch(
"documents.index.rmtree",
) as mock_rmtree,
):
ix = index.open_index()
self.assertIs(ix, expected_index)
self.assertGreaterEqual(mock_open_dir.call_count, 2)
mock_rmtree.assert_not_called()
mock_create_in.assert_not_called()
self.assertEqual(file_marker.read_text(), "keep")
def test_transient_errors_exhaust_retries_and_recreate(self):
"""
GIVEN:
- Index directory exists
WHEN:
- open_index is called
- Opening the index raises FileNotFoundError multiple times due to
transient missing segments
THEN:
- Index is recreated after retries are exhausted
"""
recreated_index = object()
with (
self.assertLogs("paperless.index", level="ERROR") as cm,
mock.patch("documents.index.exists_in", return_value=True),
mock.patch(
"documents.index.open_dir",
side_effect=FileNotFoundError("missing"),
) as mock_open_dir,
mock.patch("documents.index.rmtree") as mock_rmtree,
mock.patch(
"documents.index.create_in",
return_value=recreated_index,
) as mock_create_in,
):
ix = index.open_index()
self.assertIs(ix, recreated_index)
self.assertEqual(mock_open_dir.call_count, 4)
mock_rmtree.assert_called_once_with(settings.INDEX_DIR)
self._assert_recreate_called(mock_create_in)
self.assertIn(
"Error while opening the index after retries, recreating.",
cm.output[0],
)
def test_non_transient_error_recreates_index(self):
"""
GIVEN:
- Index directory exists
WHEN:
- open_index is called
- Opening the index raises a "non-transient" error
THEN:
- Index is recreated
"""
recreated_index = object()
with (
self.assertLogs("paperless.index", level="ERROR") as cm,
mock.patch("documents.index.exists_in", return_value=True),
mock.patch(
"documents.index.open_dir",
side_effect=RuntimeError("boom"),
),
mock.patch("documents.index.rmtree") as mock_rmtree,
mock.patch(
"documents.index.create_in",
return_value=recreated_index,
) as mock_create_in,
):
ix = index.open_index()
self.assertIs(ix, recreated_index)
mock_rmtree.assert_called_once_with(settings.INDEX_DIR)
self._assert_recreate_called(mock_create_in)
self.assertIn(
"Error while opening the index, recreating.",
cm.output[0],
)

View File

@@ -2094,6 +2094,68 @@ class TestWorkflows(
doc.refresh_from_db()
self.assertIsNone(doc.owner)
def test_workflow_scheduled_recurring_respects_latest_run(self):
"""
GIVEN:
- Scheduled workflow marked as recurring with a 1-day interval
- Document that matches the trigger
- Two prior runs exist: one 2 days ago and one 1 hour ago
WHEN:
- Scheduled workflows are checked again
THEN:
- Workflow does not run because the most recent run is inside the interval
"""
trigger = WorkflowTrigger.objects.create(
type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED,
schedule_date_field=WorkflowTrigger.ScheduleDateField.CREATED,
schedule_is_recurring=True,
schedule_recurring_interval_days=1,
)
action = WorkflowAction.objects.create(
assign_title="Doc assign owner",
assign_owner=self.user2,
)
w = Workflow.objects.create(
name="Workflow 1",
order=0,
)
w.triggers.add(trigger)
w.actions.add(action)
w.save()
doc = Document.objects.create(
title="sample test",
correspondent=self.c,
original_filename="sample.pdf",
created=timezone.now().date() - timedelta(days=3),
)
WorkflowRun.objects.create(
workflow=w,
document=doc,
type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED,
run_at=timezone.now() - timedelta(days=2),
)
WorkflowRun.objects.create(
workflow=w,
document=doc,
type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED,
run_at=timezone.now() - timedelta(hours=1),
)
tasks.check_scheduled_workflows()
doc.refresh_from_db()
self.assertIsNone(doc.owner)
self.assertEqual(
WorkflowRun.objects.filter(
workflow=w,
document=doc,
type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED,
).count(),
2,
)
def test_workflow_scheduled_trigger_negative_offset_customfield(self):
"""
GIVEN:

View File

@@ -1631,6 +1631,7 @@ class BulkEditView(PassUserMixin):
"merge": None,
"edit_pdf": "checksum",
"reprocess": "checksum",
"remove_password": "checksum",
}
permission_classes = (IsAuthenticated,)
@@ -1649,6 +1650,7 @@ class BulkEditView(PassUserMixin):
bulk_edit.split,
bulk_edit.merge,
bulk_edit.edit_pdf,
bulk_edit.remove_password,
]:
parameters["user"] = user
@@ -1677,6 +1679,7 @@ class BulkEditView(PassUserMixin):
bulk_edit.rotate,
bulk_edit.delete_pages,
bulk_edit.edit_pdf,
bulk_edit.remove_password,
]
)
or (
@@ -1693,7 +1696,7 @@ class BulkEditView(PassUserMixin):
and (
method in [bulk_edit.split, bulk_edit.merge]
or (
method == bulk_edit.edit_pdf
method in [bulk_edit.edit_pdf, bulk_edit.remove_password]
and not parameters["update_document"]
)
)

View File

@@ -2,7 +2,7 @@ msgid ""
msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2026-01-06 17:11+0000\n"
"POT-Creation-Date: 2026-01-08 21:50+0000\n"
"PO-Revision-Date: 2022-02-17 04:17\n"
"Last-Translator: \n"
"Language-Team: English\n"
@@ -1223,31 +1223,31 @@ msgstr ""
msgid "Invalid color."
msgstr ""
#: documents/serialisers.py:1835
#: documents/serialisers.py:1846
#, python-format
msgid "File type %(type)s not supported"
msgstr ""
#: documents/serialisers.py:1879
#: documents/serialisers.py:1890
#, python-format
msgid "Custom field id must be an integer: %(id)s"
msgstr ""
#: documents/serialisers.py:1886
#: documents/serialisers.py:1897
#, python-format
msgid "Custom field with id %(id)s does not exist"
msgstr ""
#: documents/serialisers.py:1903 documents/serialisers.py:1913
#: documents/serialisers.py:1914 documents/serialisers.py:1924
msgid ""
"Custom fields must be a list of integers or an object mapping ids to values."
msgstr ""
#: documents/serialisers.py:1908
#: documents/serialisers.py:1919
msgid "Some custom fields don't exist or were specified twice."
msgstr ""
#: documents/serialisers.py:2023
#: documents/serialisers.py:2034
msgid "Invalid variable detected."
msgstr ""
@@ -1702,151 +1702,151 @@ msgstr ""
msgid "paperless application settings"
msgstr ""
#: paperless/settings.py:767
#: paperless/settings.py:768
msgid "English (US)"
msgstr ""
#: paperless/settings.py:768
#: paperless/settings.py:769
msgid "Arabic"
msgstr ""
#: paperless/settings.py:769
#: paperless/settings.py:770
msgid "Afrikaans"
msgstr ""
#: paperless/settings.py:770
#: paperless/settings.py:771
msgid "Belarusian"
msgstr ""
#: paperless/settings.py:771
#: paperless/settings.py:772
msgid "Bulgarian"
msgstr ""
#: paperless/settings.py:772
#: paperless/settings.py:773
msgid "Catalan"
msgstr ""
#: paperless/settings.py:773
#: paperless/settings.py:774
msgid "Czech"
msgstr ""
#: paperless/settings.py:774
#: paperless/settings.py:775
msgid "Danish"
msgstr ""
#: paperless/settings.py:775
#: paperless/settings.py:776
msgid "German"
msgstr ""
#: paperless/settings.py:776
#: paperless/settings.py:777
msgid "Greek"
msgstr ""
#: paperless/settings.py:777
#: paperless/settings.py:778
msgid "English (GB)"
msgstr ""
#: paperless/settings.py:778
#: paperless/settings.py:779
msgid "Spanish"
msgstr ""
#: paperless/settings.py:779
#: paperless/settings.py:780
msgid "Persian"
msgstr ""
#: paperless/settings.py:780
#: paperless/settings.py:781
msgid "Finnish"
msgstr ""
#: paperless/settings.py:781
#: paperless/settings.py:782
msgid "French"
msgstr ""
#: paperless/settings.py:782
#: paperless/settings.py:783
msgid "Hungarian"
msgstr ""
#: paperless/settings.py:783
#: paperless/settings.py:784
msgid "Indonesian"
msgstr ""
#: paperless/settings.py:784
#: paperless/settings.py:785
msgid "Italian"
msgstr ""
#: paperless/settings.py:785
#: paperless/settings.py:786
msgid "Japanese"
msgstr ""
#: paperless/settings.py:786
#: paperless/settings.py:787
msgid "Korean"
msgstr ""
#: paperless/settings.py:787
#: paperless/settings.py:788
msgid "Luxembourgish"
msgstr ""
#: paperless/settings.py:788
#: paperless/settings.py:789
msgid "Norwegian"
msgstr ""
#: paperless/settings.py:789
#: paperless/settings.py:790
msgid "Dutch"
msgstr ""
#: paperless/settings.py:790
#: paperless/settings.py:791
msgid "Polish"
msgstr ""
#: paperless/settings.py:791
#: paperless/settings.py:792
msgid "Portuguese (Brazil)"
msgstr ""
#: paperless/settings.py:792
#: paperless/settings.py:793
msgid "Portuguese"
msgstr ""
#: paperless/settings.py:793
#: paperless/settings.py:794
msgid "Romanian"
msgstr ""
#: paperless/settings.py:794
#: paperless/settings.py:795
msgid "Russian"
msgstr ""
#: paperless/settings.py:795
#: paperless/settings.py:796
msgid "Slovak"
msgstr ""
#: paperless/settings.py:796
#: paperless/settings.py:797
msgid "Slovenian"
msgstr ""
#: paperless/settings.py:797
#: paperless/settings.py:798
msgid "Serbian"
msgstr ""
#: paperless/settings.py:798
#: paperless/settings.py:799
msgid "Swedish"
msgstr ""
#: paperless/settings.py:799
#: paperless/settings.py:800
msgid "Turkish"
msgstr ""
#: paperless/settings.py:800
#: paperless/settings.py:801
msgid "Ukrainian"
msgstr ""
#: paperless/settings.py:801
#: paperless/settings.py:802
msgid "Vietnamese"
msgstr ""
#: paperless/settings.py:802
#: paperless/settings.py:803
msgid "Chinese Simplified"
msgstr ""
#: paperless/settings.py:803
#: paperless/settings.py:804
msgid "Chinese Traditional"
msgstr ""

View File

@@ -334,6 +334,7 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"paperless_remote.apps.PaperlessRemoteParserConfig",
"django.contrib.admin",
"rest_framework",
"rest_framework.authtoken",
@@ -1430,6 +1431,13 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean(
"true",
)
###############################################################################
# Remote Parser #
###############################################################################
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")
################################################################################
# AI Settings #
################################################################################

View File

@@ -0,0 +1,4 @@
# this is here so that django finds the checks.
from paperless_remote.checks import check_remote_parser_configured
__all__ = ["check_remote_parser_configured"]

View File

@@ -0,0 +1,14 @@
from django.apps import AppConfig
from paperless_remote.signals import remote_consumer_declaration
class PaperlessRemoteParserConfig(AppConfig):
name = "paperless_remote"
def ready(self):
from documents.signals import document_consumer_declaration
document_consumer_declaration.connect(remote_consumer_declaration)
AppConfig.ready(self)

View File

@@ -0,0 +1,17 @@
from django.conf import settings
from django.core.checks import Error
from django.core.checks import register
@register()
def check_remote_parser_configured(app_configs, **kwargs):
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
):
return [
Error(
"Azure AI remote parser requires endpoint and API key to be configured.",
),
]
return []

View File

@@ -0,0 +1,118 @@
from pathlib import Path
from django.conf import settings
from paperless_tesseract.parsers import RasterisedDocumentParser
class RemoteEngineConfig:
def __init__(
self,
engine: str,
api_key: str | None = None,
endpoint: str | None = None,
):
self.engine = engine
self.api_key = api_key
self.endpoint = endpoint
def engine_is_valid(self):
valid = self.engine in ["azureai"] and self.api_key is not None
if self.engine == "azureai":
valid = valid and self.endpoint is not None
return valid
class RemoteDocumentParser(RasterisedDocumentParser):
"""
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
as this is the only service that provides a remote OCR API with text-embedded PDF output.
"""
logging_name = "paperless.parsing.remote"
def get_settings(self) -> RemoteEngineConfig:
"""
Returns the configuration for the remote OCR engine, loaded from Django settings.
"""
return RemoteEngineConfig(
engine=settings.REMOTE_OCR_ENGINE,
api_key=settings.REMOTE_OCR_API_KEY,
endpoint=settings.REMOTE_OCR_ENDPOINT,
)
def supported_mime_types(self):
if self.settings.engine_is_valid():
return {
"application/pdf": ".pdf",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/tiff": ".tiff",
"image/bmp": ".bmp",
"image/gif": ".gif",
"image/webp": ".webp",
}
else:
return {}
def azure_ai_vision_parse(
self,
file: Path,
) -> str | None:
"""
Uses Azure AI Vision to parse the document and return the text content.
It requests a searchable PDF output with embedded text.
The PDF is saved to the archive_path attribute.
Returns the text content extracted from the document.
If the parsing fails, it returns None.
"""
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.documentintelligence.models import AnalyzeOutputOption
from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient(
endpoint=self.settings.endpoint,
credential=AzureKeyCredential(self.settings.api_key),
)
try:
with file.open("rb") as f:
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
poller = client.begin_analyze_document(
model_id="prebuilt-read",
body=analyze_request,
output_content_format=DocumentContentFormat.TEXT,
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
content_type="application/json",
)
poller.wait()
result_id = poller.details["operation_id"]
result = poller.result()
# Download the PDF with embedded text
self.archive_path = self.tempdir / "archive.pdf"
with self.archive_path.open("wb") as f:
for chunk in client.get_analyze_result_pdf(
model_id="prebuilt-read",
result_id=result_id,
):
f.write(chunk)
return result.content
except Exception as e:
self.log.error(f"Azure AI Vision parsing failed: {e}")
finally:
client.close()
return None
def parse(self, document_path: Path, mime_type, file_name=None):
if not self.settings.engine_is_valid():
self.log.warning(
"No valid remote parser engine is configured, content will be empty.",
)
self.text = ""
elif self.settings.engine == "azureai":
self.text = self.azure_ai_vision_parse(document_path)

View File

@@ -0,0 +1,18 @@
def get_parser(*args, **kwargs):
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(*args, **kwargs)
def get_supported_mime_types():
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(None).supported_mime_types()
def remote_consumer_declaration(sender, **kwargs):
return {
"parser": get_parser,
"weight": 5,
"mime_types": get_supported_mime_types(),
}

View File

Binary file not shown.

View File

@@ -0,0 +1,24 @@
from unittest import TestCase
from django.test import override_settings
from paperless_remote import check_remote_parser_configured
class TestChecks(TestCase):
@override_settings(REMOTE_OCR_ENGINE=None)
def test_no_engine(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 0)
@override_settings(REMOTE_OCR_ENGINE="azureai")
@override_settings(REMOTE_OCR_API_KEY="somekey")
@override_settings(REMOTE_OCR_ENDPOINT=None)
def test_azure_no_endpoint(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 1)
self.assertTrue(
msgs[0].msg.startswith(
"Azure AI remote parser requires endpoint and API key to be configured.",
),
)

View File

@@ -0,0 +1,128 @@
import uuid
from pathlib import Path
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser
from paperless_remote.signals import get_parser
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
def assertContainsStrings(self, content: str, strings: list[str]):
# Asserts that all strings appear in content, in the given order.
indices = []
for s in strings:
if s in content:
indices.append(content.index(s))
else:
self.fail(f"'{s}' is not in '{content}'")
self.assertListEqual(indices, sorted(indices))
@mock.patch("paperless_tesseract.parsers.run_subprocess")
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
# Arrange mock Azure client
mock_client = mock.Mock()
mock_client_cls.return_value = mock_client
# Simulate poller result and its `.details`
mock_poller = mock.Mock()
mock_poller.wait.return_value = None
mock_poller.details = {"operation_id": "fake-op-id"}
mock_client.begin_analyze_document.return_value = mock_poller
mock_poller.result.return_value.content = "This is a test document."
# Return dummy PDF bytes
mock_client.get_analyze_result_pdf.return_value = [
b"%PDF-",
b"1.7 ",
b"FAKEPDF",
]
# Simulate pdftotext by writing dummy text to sidecar file
def fake_run(cmd, *args, **kwargs):
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
f.write("This is a test document.")
mock_subprocess.side_effect = fake_run
with override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="somekey",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
):
parser = get_parser(uuid.uuid4())
parser.parse(
self.SAMPLE_FILES / "simple-digital.pdf",
"application/pdf",
)
self.assertContainsStrings(
parser.text.strip(),
["This is a test document."],
)
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
def test_get_text_with_azure_error_logged_and_returns_none(self, mock_client_cls):
mock_client = mock.Mock()
mock_client.begin_analyze_document.side_effect = RuntimeError("fail")
mock_client_cls.return_value = mock_client
with override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="somekey",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
):
parser = get_parser(uuid.uuid4())
with mock.patch.object(parser.log, "error") as mock_log_error:
parser.parse(
self.SAMPLE_FILES / "simple-digital.pdf",
"application/pdf",
)
self.assertIsNone(parser.text)
mock_client.begin_analyze_document.assert_called_once()
mock_client.close.assert_called_once()
mock_log_error.assert_called_once()
self.assertIn(
"Azure AI Vision parsing failed",
mock_log_error.call_args[0][0],
)
@override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="key",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
)
def test_supported_mime_types_valid_config(self):
parser = RemoteDocumentParser(uuid.uuid4())
expected_types = {
"application/pdf": ".pdf",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/tiff": ".tiff",
"image/bmp": ".bmp",
"image/gif": ".gif",
"image/webp": ".webp",
}
self.assertEqual(parser.supported_mime_types(), expected_types)
def test_supported_mime_types_invalid_config(self):
parser = get_parser(uuid.uuid4())
self.assertEqual(parser.supported_mime_types(), {})
@override_settings(
REMOTE_OCR_ENGINE=None,
REMOTE_OCR_API_KEY=None,
REMOTE_OCR_ENDPOINT=None,
)
def test_parse_with_invalid_config(self):
parser = get_parser(uuid.uuid4())
parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
self.assertEqual(parser.text, "")