diff --git a/docs/api.md b/docs/api.md index f7e12bf67..1ac634162 100644 --- a/docs/api.md +++ b/docs/api.md @@ -294,6 +294,13 @@ The following methods are supported: - `"delete_original": true` to delete the original documents after editing. - `"update_document": true` to update the existing document with the edited PDF. - `"include_metadata": true` to copy metadata from the original document to the edited document. +- `remove_password` + - Requires `parameters`: + - `"password": "PASSWORD_STRING"` The current password of the PDF documents, which is required to remove their password protection. + - Optional `parameters`: + - `"update_document": true` to replace the existing document with the password-less PDF. + - `"delete_original": true` to delete the original document after editing. + - `"include_metadata": true` to copy metadata from the original document to the new password-less document. - `merge` - No additional `parameters` required. - The ordering of the merged document is determined by the list of IDs. diff --git a/docs/configuration.md b/docs/configuration.md index f7e9cd7c6..b46a98faf 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1805,6 +1805,26 @@ password. All of these options come from their similarly-named [Django settings] : Defaults to false. +## Remote OCR + +#### [`PAPERLESS_REMOTE_OCR_ENGINE=`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE} + +: The remote OCR engine to use. Currently only Azure AI is supported; set this to "azureai" to use it. + + Defaults to None, which disables remote OCR. + +#### [`PAPERLESS_REMOTE_OCR_API_KEY=`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY} + +: The API key to use for the remote OCR engine. + + Defaults to None. + +#### [`PAPERLESS_REMOTE_OCR_ENDPOINT=`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT} + +: The endpoint to use for the remote OCR engine. This is required for Azure AI. + + Defaults to None. 
+ ## AI {#ai} #### [`PAPERLESS_AI_ENABLED=`](#PAPERLESS_AI_ENABLED) {#PAPERLESS_AI_ENABLED} diff --git a/docs/index.md b/docs/index.md index fd1473b71..1d72f8f6c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -27,7 +27,8 @@ physical documents into a searchable online archive so you can keep, well, _less - **Organize and index** your scanned documents with tags, correspondents, types, and more. - _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so. - Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images. -- Utilizes the open-source Tesseract engine to recognize more than 100 languages. + - Utilizes the open-source Tesseract engine to recognize more than 100 languages. + - _New!_ Supports remote OCR with Azure AI (opt-in). - Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals. - Uses machine-learning to automatically add tags, correspondents and document types to your documents. - **New**: Paperless-ngx can now leverage AI (Large Language Models or LLMs) for document suggestions. This is an optional feature that can be enabled (and is disabled by default). diff --git a/docs/usage.md b/docs/usage.md index 69223641c..f5c99aeaf 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -923,6 +923,21 @@ how regularly you intend to scan documents and use paperless. performed the task associated with the document, move it to the inbox. +## Remote OCR + +!!! important + + This feature is disabled by default and will always remain strictly "opt-in". + +Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to +[Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence). 
+This is a paid service (with a free tier) that requires an Azure account and subscription. Azure AI is not affiliated with +Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing +the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details. + +Additionally, when using a commercial service with this feature, consider both the potential costs and any associated file size +or page limitations (e.g. with a free tier). + ## Architecture Paperless-ngx consists of the following components: diff --git a/pyproject.toml b/pyproject.toml index 4598ebcee..2ba8325b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ classifiers = [ # This will allow testing to not install a webserver, mysql, etc dependencies = [ + "azure-ai-documentintelligence>=1.0.2", "babel>=2.17", "bleach~=6.3.0", "celery[redis]~=5.5.1", @@ -262,6 +263,7 @@ testpaths = [ "src/paperless_tesseract/tests/", "src/paperless_tika/tests", "src/paperless_text/tests/", + "src/paperless_remote/tests/", "src/paperless_ai/tests", ] addopts = [ diff --git a/src-ui/messages.xlf b/src-ui/messages.xlf index bd1b943b3..a0692e794 100644 --- a/src-ui/messages.xlf +++ b/src-ui/messages.xlf @@ -328,23 +328,23 @@ src/app/components/manage/custom-fields/custom-fields.component.html - 61 + 70 src/app/components/manage/management-list/management-list.component.html - 139 + 151 src/app/components/manage/management-list/management-list.component.html - 139 + 151 src/app/components/manage/management-list/management-list.component.html - 139 + 151 src/app/components/manage/management-list/management-list.component.html - 139 + 151 @@ -385,7 +385,7 @@ src/app/components/document-detail/document-detail.component.html - 113 + 119 @@ -534,7 +534,7 @@ src/app/components/document-detail/document-detail.component.html - 374 + 380 @@ -593,7 +593,7 @@ 
src/app/components/document-detail/document-detail.component.html - 367 + 373 src/app/components/document-list/bulk-editor/custom-fields-bulk-edit-dialog/custom-fields-bulk-edit-dialog.component.html @@ -761,7 +761,7 @@ src/app/components/document-detail/document-detail.component.html - 387 + 393 src/app/components/document-list/document-list.component.html @@ -1234,7 +1234,7 @@ src/app/components/document-detail/document-detail.component.html - 343 + 349 src/app/components/document-list/bulk-editor/bulk-editor.component.html @@ -2164,7 +2164,7 @@ src/app/components/manage/custom-fields/custom-fields.component.html - 55 + 61 src/app/components/manage/mail/mail.component.html @@ -2216,19 +2216,19 @@ src/app/components/manage/management-list/management-list.component.html - 133 + 140 src/app/components/manage/management-list/management-list.component.html - 133 + 140 src/app/components/manage/management-list/management-list.component.html - 133 + 140 src/app/components/manage/management-list/management-list.component.html - 133 + 140 src/app/components/manage/management-list/management-list.component.ts @@ -2300,7 +2300,7 @@ src/app/components/manage/custom-fields/custom-fields.component.ts - 104 + 106 src/app/components/manage/mail/mail.component.ts @@ -2483,7 +2483,7 @@ src/app/components/manage/custom-fields/custom-fields.component.html - 52 + 58 src/app/components/manage/mail/mail.component.html @@ -2519,19 +2519,19 @@ src/app/components/manage/management-list/management-list.component.html - 130 + 137 src/app/components/manage/management-list/management-list.component.html - 130 + 137 src/app/components/manage/management-list/management-list.component.html - 130 + 137 src/app/components/manage/management-list/management-list.component.html - 130 + 137 src/app/components/manage/workflows/workflows.component.html @@ -2607,11 +2607,11 @@ src/app/components/document-detail/document-detail.component.ts - 1028 + 1030 
src/app/components/document-detail/document-detail.component.ts - 1393 + 1395 src/app/components/document-list/bulk-editor/bulk-editor.component.ts @@ -2627,7 +2627,7 @@ src/app/components/manage/custom-fields/custom-fields.component.ts - 106 + 108 src/app/components/manage/mail/mail.component.ts @@ -3223,7 +3223,7 @@ src/app/components/document-detail/document-detail.component.ts - 981 + 983 src/app/components/document-list/bulk-editor/bulk-editor.component.ts @@ -3292,6 +3292,67 @@ 39 + + Replace current document + + src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.html + 22 + + + + Create new document + + src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.html + 35 + + + + Copy metadata + + src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.html + 43,44 + + + + Delete original + + src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.html + 48 + + + + Remove password protection + + src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.ts + 18 + + + src/app/components/document-detail/document-detail.component.ts + 1446 + + + + Create an unprotected copy or replace the existing file. + + src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.ts + 22 + + + src/app/components/document-detail/document-detail.component.ts + 1447 + + + + Start + + src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.ts + 25 + + + src/app/components/document-detail/document-detail.component.ts + 1448 + + Note that only PDFs will be rotated. 
@@ -3340,7 +3401,7 @@ src/app/components/manage/custom-fields/custom-fields.component.ts - 85 + 87 @@ -3351,7 +3412,7 @@ src/app/components/manage/custom-fields/custom-fields.component.ts - 94 + 96 @@ -3413,7 +3474,7 @@ src/app/components/document-detail/document-detail.component.html - 107 + 113 src/app/guards/dirty-saved-view.guard.ts @@ -4361,7 +4422,7 @@ src/app/components/manage/storage-path-list/storage-path-list.component.ts - 49 + 51 @@ -4379,7 +4440,7 @@ src/app/components/document-detail/document-detail.component.html - 309 + 315 @@ -4436,7 +4497,7 @@ src/app/components/manage/tag-list/tag-list.component.ts - 49 + 51 @@ -4490,7 +4551,7 @@ src/app/components/document-detail/document-detail.component.html - 92 + 98 src/app/components/document-list/bulk-editor/bulk-editor.component.html @@ -6210,7 +6271,7 @@ src/app/components/document-detail/document-detail.component.html - 88 + 94 @@ -6806,35 +6867,42 @@ src/app/components/document-detail/document-detail.component.ts - 1392 + 1394 + + + + Remove Password + + src/app/components/document-detail/document-detail.component.html + 71 Send src/app/components/document-detail/document-detail.component.html - 84 + 90 Previous src/app/components/document-detail/document-detail.component.html - 110 + 116 Details src/app/components/document-detail/document-detail.component.html - 123 + 129 Title src/app/components/document-detail/document-detail.component.html - 126 + 132 src/app/components/document-list/document-list.component.html @@ -6857,21 +6925,21 @@ Archive serial number src/app/components/document-detail/document-detail.component.html - 127 + 133 Date created src/app/components/document-detail/document-detail.component.html - 128 + 134 Correspondent src/app/components/document-detail/document-detail.component.html - 130 + 136 src/app/components/document-list/bulk-editor/bulk-editor.component.html @@ -6898,7 +6966,7 @@ Document type src/app/components/document-detail/document-detail.component.html - 132 + 138 
src/app/components/document-list/bulk-editor/bulk-editor.component.html @@ -6925,7 +6993,7 @@ Storage path src/app/components/document-detail/document-detail.component.html - 134 + 140 src/app/components/document-list/bulk-editor/bulk-editor.component.html @@ -6948,7 +7016,7 @@ Default src/app/components/document-detail/document-detail.component.html - 135 + 141 src/app/components/manage/saved-views/saved-views.component.html @@ -6959,14 +7027,14 @@ Content src/app/components/document-detail/document-detail.component.html - 239 + 245 Metadata src/app/components/document-detail/document-detail.component.html - 248 + 254 src/app/components/document-detail/metadata-collapse/metadata-collapse.component.ts @@ -6977,175 +7045,175 @@ Date modified src/app/components/document-detail/document-detail.component.html - 255 + 261 Date added src/app/components/document-detail/document-detail.component.html - 259 + 265 Media filename src/app/components/document-detail/document-detail.component.html - 263 + 269 Original filename src/app/components/document-detail/document-detail.component.html - 267 + 273 Original MD5 checksum src/app/components/document-detail/document-detail.component.html - 271 + 277 Original file size src/app/components/document-detail/document-detail.component.html - 275 + 281 Original mime type src/app/components/document-detail/document-detail.component.html - 279 + 285 Archive MD5 checksum src/app/components/document-detail/document-detail.component.html - 284 + 290 Archive file size src/app/components/document-detail/document-detail.component.html - 290 + 296 Original document metadata src/app/components/document-detail/document-detail.component.html - 299 + 305 Archived document metadata src/app/components/document-detail/document-detail.component.html - 302 + 308 Notes src/app/components/document-detail/document-detail.component.html - 321,324 + 327,330 History src/app/components/document-detail/document-detail.component.html - 332 + 338 Save & next 
src/app/components/document-detail/document-detail.component.html - 369 + 375 Save & close src/app/components/document-detail/document-detail.component.html - 372 + 378 Document loading... src/app/components/document-detail/document-detail.component.html - 382 + 388 Enter Password src/app/components/document-detail/document-detail.component.html - 436 + 442 An error occurred loading content: src/app/components/document-detail/document-detail.component.ts - 416,418 + 418,420 Document changes detected src/app/components/document-detail/document-detail.component.ts - 450 + 452 The version of this document in your browser session appears older than the existing version. src/app/components/document-detail/document-detail.component.ts - 451 + 453 Saving the document here may overwrite other changes that were made. To restore the existing version, discard your changes or close the document. src/app/components/document-detail/document-detail.component.ts - 452 + 454 Ok src/app/components/document-detail/document-detail.component.ts - 454 + 456 Next document src/app/components/document-detail/document-detail.component.ts - 580 + 582 Previous document src/app/components/document-detail/document-detail.component.ts - 590 + 592 Close document src/app/components/document-detail/document-detail.component.ts - 598 + 600 src/app/services/open-documents.service.ts @@ -7156,67 +7224,67 @@ Save document src/app/components/document-detail/document-detail.component.ts - 605 + 607 Save and close / next src/app/components/document-detail/document-detail.component.ts - 614 + 616 Error retrieving metadata src/app/components/document-detail/document-detail.component.ts - 669 + 671 Error retrieving suggestions. src/app/components/document-detail/document-detail.component.ts - 698 + 700 Document "" saved successfully. 
src/app/components/document-detail/document-detail.component.ts - 870 + 872 src/app/components/document-detail/document-detail.component.ts - 894 + 896 Error saving document "" src/app/components/document-detail/document-detail.component.ts - 900 + 902 Error saving document src/app/components/document-detail/document-detail.component.ts - 950 + 952 Do you really want to move the document "" to the trash? src/app/components/document-detail/document-detail.component.ts - 982 + 984 Documents can be restored prior to permanent deletion. src/app/components/document-detail/document-detail.component.ts - 983 + 985 src/app/components/document-list/bulk-editor/bulk-editor.component.ts @@ -7227,7 +7295,7 @@ Move to trash src/app/components/document-detail/document-detail.component.ts - 985 + 987 src/app/components/document-list/bulk-editor/bulk-editor.component.ts @@ -7238,14 +7306,14 @@ Error deleting document src/app/components/document-detail/document-detail.component.ts - 1004 + 1006 Reprocess confirm src/app/components/document-detail/document-detail.component.ts - 1024 + 1026 src/app/components/document-list/bulk-editor/bulk-editor.component.ts @@ -7256,81 +7324,102 @@ This operation will permanently recreate the archive file for this document. src/app/components/document-detail/document-detail.component.ts - 1025 + 1027 The archive file will be re-generated with the current settings. src/app/components/document-detail/document-detail.component.ts - 1026 + 1028 Reprocess operation for "" will begin in the background. Close and re-open or reload this document after the operation has completed to see new content. 
src/app/components/document-detail/document-detail.component.ts - 1036 + 1038 Error executing operation src/app/components/document-detail/document-detail.component.ts - 1047 + 1049 Error downloading document src/app/components/document-detail/document-detail.component.ts - 1096 + 1098 Page Fit src/app/components/document-detail/document-detail.component.ts - 1173 + 1175 PDF edit operation for "" will begin in the background. src/app/components/document-detail/document-detail.component.ts - 1411 + 1413 Error executing PDF edit operation src/app/components/document-detail/document-detail.component.ts - 1423 + 1425 + + + + Please enter the current password before attempting to remove it. + + src/app/components/document-detail/document-detail.component.ts + 1436 + + + + Password removal operation for "" will begin in the background. + + src/app/components/document-detail/document-detail.component.ts + 1468 + + + + Error executing password removal operation + + src/app/components/document-detail/document-detail.component.ts + 1482 Print failed. src/app/components/document-detail/document-detail.component.ts - 1460 + 1519 Error loading document for printing. src/app/components/document-detail/document-detail.component.ts - 1472 + 1531 An error occurred loading tiff: src/app/components/document-detail/document-detail.component.ts - 1537 + 1596 src/app/components/document-detail/document-detail.component.ts - 1541 + 1600 @@ -8500,28 +8589,28 @@ correspondent src/app/components/manage/correspondent-list/correspondent-list.component.ts - 47 + 49 correspondents src/app/components/manage/correspondent-list/correspondent-list.component.ts - 48 + 50 Last used src/app/components/manage/correspondent-list/correspondent-list.component.ts - 53 + 55 Do you really want to delete the correspondent ""? 
src/app/components/manage/correspondent-list/correspondent-list.component.ts - 78 + 80 @@ -8549,79 +8638,79 @@ Filter Documents () src/app/components/manage/custom-fields/custom-fields.component.html - 45 + 50 src/app/components/manage/management-list/management-list.component.html - 123 + 129 src/app/components/manage/management-list/management-list.component.html - 123 + 129 src/app/components/manage/management-list/management-list.component.html - 123 + 129 src/app/components/manage/management-list/management-list.component.html - 123 + 129 No fields defined. src/app/components/manage/custom-fields/custom-fields.component.html - 70 + 80 Confirm delete field src/app/components/manage/custom-fields/custom-fields.component.ts - 102 + 104 This operation will permanently delete this field. src/app/components/manage/custom-fields/custom-fields.component.ts - 103 + 105 Deleted field "" src/app/components/manage/custom-fields/custom-fields.component.ts - 112 + 114 Error deleting field "". src/app/components/manage/custom-fields/custom-fields.component.ts - 121 + 123 document type src/app/components/manage/document-type-list/document-type-list.component.ts - 43 + 45 document types src/app/components/manage/document-type-list/document-type-list.component.ts - 44 + 46 Do you really want to delete the document type ""? src/app/components/manage/document-type-list/document-type-list.component.ts - 49 + 51 @@ -9161,42 +9250,42 @@ storage path src/app/components/manage/storage-path-list/storage-path-list.component.ts - 43 + 45 storage paths src/app/components/manage/storage-path-list/storage-path-list.component.ts - 44 + 46 Do you really want to delete the storage path ""? src/app/components/manage/storage-path-list/storage-path-list.component.ts - 60 + 62 tag src/app/components/manage/tag-list/tag-list.component.ts - 43 + 45 tags src/app/components/manage/tag-list/tag-list.component.ts - 44 + 46 Do you really want to delete the tag ""? 
src/app/components/manage/tag-list/tag-list.component.ts - 60 + 62 diff --git a/src-ui/src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.html b/src-ui/src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.html new file mode 100644 index 000000000..fc866fe40 --- /dev/null +++ b/src-ui/src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.html @@ -0,0 +1,75 @@ + + + diff --git a/src-ui/src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.scss b/src-ui/src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.scss new file mode 100644 index 000000000..e69de29bb diff --git a/src-ui/src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.spec.ts b/src-ui/src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.spec.ts new file mode 100644 index 000000000..a1449511b --- /dev/null +++ b/src-ui/src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.spec.ts @@ -0,0 +1,53 @@ +import { ComponentFixture, TestBed } from '@angular/core/testing' +import { By } from '@angular/platform-browser' +import { NgbActiveModal } from '@ng-bootstrap/ng-bootstrap' +import { NgxBootstrapIconsModule, allIcons } from 'ngx-bootstrap-icons' +import { PasswordRemovalConfirmDialogComponent } from './password-removal-confirm-dialog.component' + +describe('PasswordRemovalConfirmDialogComponent', () => { + let component: PasswordRemovalConfirmDialogComponent + let fixture: ComponentFixture + + beforeEach(async () => { + await TestBed.configureTestingModule({ + providers: [NgbActiveModal], + imports: [ + 
NgxBootstrapIconsModule.pick(allIcons), + PasswordRemovalConfirmDialogComponent, + ], + }).compileComponents() + + fixture = TestBed.createComponent(PasswordRemovalConfirmDialogComponent) + component = fixture.componentInstance + fixture.detectChanges() + }) + + it('should default to replacing the document', () => { + expect(component.updateDocument).toBe(true) + expect( + fixture.debugElement.query(By.css('#removeReplace')).nativeElement.checked + ).toBe(true) + }) + + it('should allow creating a new document with metadata and delete toggle', () => { + component.onUpdateDocumentChange(false) + fixture.detectChanges() + + expect(component.updateDocument).toBe(false) + expect(fixture.debugElement.query(By.css('#copyMetaRemove'))).not.toBeNull() + + component.includeMetadata = false + component.deleteOriginal = true + component.onUpdateDocumentChange(true) + expect(component.updateDocument).toBe(true) + expect(component.includeMetadata).toBe(true) + expect(component.deleteOriginal).toBe(false) + }) + + it('should emit confirm when confirmed', () => { + let confirmed = false + component.confirmClicked.subscribe(() => (confirmed = true)) + component.confirm() + expect(confirmed).toBe(true) + }) +}) diff --git a/src-ui/src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.ts b/src-ui/src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.ts new file mode 100644 index 000000000..82444ad13 --- /dev/null +++ b/src-ui/src/app/components/common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component.ts @@ -0,0 +1,38 @@ +import { Component, Input } from '@angular/core' +import { FormsModule } from '@angular/forms' +import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons' +import { ConfirmDialogComponent } from '../confirm-dialog.component' + +@Component({ + selector: 'pngx-password-removal-confirm-dialog', + 
templateUrl: './password-removal-confirm-dialog.component.html', + styleUrls: ['./password-removal-confirm-dialog.component.scss'], + imports: [FormsModule, NgxBootstrapIconsModule], +}) +export class PasswordRemovalConfirmDialogComponent extends ConfirmDialogComponent { + updateDocument: boolean = true + includeMetadata: boolean = true + deleteOriginal: boolean = false + + @Input() + override title = $localize`Remove password protection` + + @Input() + override message = + $localize`Create an unprotected copy or replace the existing file.` + + @Input() + override btnCaption = $localize`Start` + + constructor() { + super() + } + + onUpdateDocumentChange(updateDocument: boolean) { + this.updateDocument = updateDocument + if (this.updateDocument) { + this.deleteOriginal = false + this.includeMetadata = true + } + } +} diff --git a/src-ui/src/app/components/document-detail/document-detail.component.html b/src-ui/src/app/components/document-detail/document-detail.component.html index bdb0bfa20..44304c942 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.html +++ b/src-ui/src/app/components/document-detail/document-detail.component.html @@ -65,6 +65,12 @@ + + @if (userIsOwner && (requiresPassword || password)) { + + } diff --git a/src-ui/src/app/components/document-detail/document-detail.component.spec.ts b/src-ui/src/app/components/document-detail/document-detail.component.spec.ts index 777cd8544..198e7a7a4 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.spec.ts +++ b/src-ui/src/app/components/document-detail/document-detail.component.spec.ts @@ -66,6 +66,7 @@ import { SettingsService } from 'src/app/services/settings.service' import { ToastService } from 'src/app/services/toast.service' import { environment } from 'src/environments/environment' import { ConfirmDialogComponent } from '../common/confirm-dialog/confirm-dialog.component' +import { PasswordRemovalConfirmDialogComponent } from 
'../common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component' import { CustomFieldsDropdownComponent } from '../common/custom-fields-dropdown/custom-fields-dropdown.component' import { DocumentDetailComponent, @@ -1264,6 +1265,88 @@ describe('DocumentDetailComponent', () => { expect(closeSpy).toHaveBeenCalled() }) + it('should support removing password protection from pdfs', () => { + let modal: NgbModalRef + modalService.activeInstances.subscribe((m) => (modal = m[0])) + initNormally() + component.password = 'secret' + component.removePassword() + const dialog = + modal.componentInstance as PasswordRemovalConfirmDialogComponent + dialog.updateDocument = false + dialog.includeMetadata = false + dialog.deleteOriginal = true + dialog.confirm() + const req = httpTestingController.expectOne( + `${environment.apiBaseUrl}documents/bulk_edit/` + ) + expect(req.request.body).toEqual({ + documents: [doc.id], + method: 'remove_password', + parameters: { + password: 'secret', + update_document: false, + include_metadata: false, + delete_original: true, + }, + }) + req.flush(true) + }) + + it('should require the current password before removing it', () => { + initNormally() + const errorSpy = jest.spyOn(toastService, 'showError') + component.requiresPassword = true + component.password = '' + + component.removePassword() + + expect(errorSpy).toHaveBeenCalled() + httpTestingController.expectNone( + `${environment.apiBaseUrl}documents/bulk_edit/` + ) + }) + + it('should handle failures when removing password protection', () => { + let modal: NgbModalRef + modalService.activeInstances.subscribe((m) => (modal = m[0])) + initNormally() + const errorSpy = jest.spyOn(toastService, 'showError') + component.password = 'secret' + + component.removePassword() + const dialog = + modal.componentInstance as PasswordRemovalConfirmDialogComponent + dialog.confirm() + const req = httpTestingController.expectOne( + 
`${environment.apiBaseUrl}documents/bulk_edit/` + ) + req.error(new ErrorEvent('failed')) + + expect(errorSpy).toHaveBeenCalled() + expect(component.networkActive).toBe(false) + expect(dialog.buttonsEnabled).toBe(true) + }) + + it('should refresh the document when removing password in update mode', () => { + let modal: NgbModalRef + modalService.activeInstances.subscribe((m) => (modal = m[0])) + const refreshSpy = jest.spyOn(openDocumentsService, 'refreshDocument') + initNormally() + component.password = 'secret' + + component.removePassword() + const dialog = + modal.componentInstance as PasswordRemovalConfirmDialogComponent + dialog.confirm() + const req = httpTestingController.expectOne( + `${environment.apiBaseUrl}documents/bulk_edit/` + ) + req.flush(true) + + expect(refreshSpy).toHaveBeenCalledWith(doc.id) + }) + it('should support keyboard shortcuts', () => { initNormally() diff --git a/src-ui/src/app/components/document-detail/document-detail.component.ts b/src-ui/src/app/components/document-detail/document-detail.component.ts index df0850cf2..2ebc37a47 100644 --- a/src-ui/src/app/components/document-detail/document-detail.component.ts +++ b/src-ui/src/app/components/document-detail/document-detail.component.ts @@ -85,6 +85,7 @@ import { getFilenameFromContentDisposition } from 'src/app/utils/http' import { ISODateAdapter } from 'src/app/utils/ngb-iso-date-adapter' import * as UTIF from 'utif' import { ConfirmDialogComponent } from '../common/confirm-dialog/confirm-dialog.component' +import { PasswordRemovalConfirmDialogComponent } from '../common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component' import { CustomFieldsDropdownComponent } from '../common/custom-fields-dropdown/custom-fields-dropdown.component' import { CorrespondentEditDialogComponent } from '../common/edit-dialog/correspondent-edit-dialog/correspondent-edit-dialog.component' import { DocumentTypeEditDialogComponent } from 
'../common/edit-dialog/document-type-edit-dialog/document-type-edit-dialog.component' @@ -180,6 +181,7 @@ export enum ZoomSetting { NgxBootstrapIconsModule, PdfViewerModule, TextAreaComponent, + PasswordRemovalConfirmDialogComponent, ], }) export class DocumentDetailComponent @@ -1497,6 +1499,63 @@ export class DocumentDetailComponent }) } + removePassword() { + if (this.requiresPassword || !this.password) { + this.toastService.showError( + $localize`Please enter the current password before attempting to remove it.` + ) + return + } + const modal = this.modalService.open( + PasswordRemovalConfirmDialogComponent, + { + backdrop: 'static', + } + ) + modal.componentInstance.title = $localize`Remove password protection` + modal.componentInstance.message = $localize`Create an unprotected copy or replace the existing file.` + modal.componentInstance.btnCaption = $localize`Start` + + modal.componentInstance.confirmClicked + .pipe(takeUntil(this.unsubscribeNotifier)) + .subscribe(() => { + const dialog = + modal.componentInstance as PasswordRemovalConfirmDialogComponent + dialog.buttonsEnabled = false + this.networkActive = true + this.documentsService + .bulkEdit([this.document.id], 'remove_password', { + password: this.password, + update_document: dialog.updateDocument, + include_metadata: dialog.includeMetadata, + delete_original: dialog.deleteOriginal, + }) + .pipe(first(), takeUntil(this.unsubscribeNotifier)) + .subscribe({ + next: () => { + this.toastService.showInfo( + $localize`Password removal operation for "${this.document.title}" will begin in the background.` + ) + this.networkActive = false + modal.close() + if (!dialog.updateDocument && dialog.deleteOriginal) { + this.openDocumentService.closeDocument(this.document) + } else if (dialog.updateDocument) { + this.openDocumentService.refreshDocument(this.documentId) + } + }, + error: (error) => { + dialog.buttonsEnabled = true + this.networkActive = false + this.toastService.showError( + $localize`Error executing 
password removal operation`, + error + ) + }, + }) + }) + } + printDocument() { const printUrl = this.documentsService.getDownloadUrl( this.document.id, diff --git a/src-ui/src/app/components/manage/correspondent-list/correspondent-list.component.ts b/src-ui/src/app/components/manage/correspondent-list/correspondent-list.component.ts index 0131ac992..957371e08 100644 --- a/src-ui/src/app/components/manage/correspondent-list/correspondent-list.component.ts +++ b/src-ui/src/app/components/manage/correspondent-list/correspondent-list.component.ts @@ -1,6 +1,7 @@ import { NgClass, NgTemplateOutlet, TitleCasePipe } from '@angular/common' import { Component, inject } from '@angular/core' import { FormsModule, ReactiveFormsModule } from '@angular/forms' +import { RouterModule } from '@angular/router' import { NgbDropdownModule, NgbPaginationModule, @@ -29,6 +30,7 @@ import { ManagementListComponent } from '../management-list/management-list.comp TitleCasePipe, FormsModule, ReactiveFormsModule, + RouterModule, NgClass, NgTemplateOutlet, NgbDropdownModule, diff --git a/src-ui/src/app/components/manage/custom-fields/custom-fields.component.html b/src-ui/src/app/components/manage/custom-fields/custom-fields.component.html index 185e9da35..0a6d80658 100644 --- a/src-ui/src/app/components/manage/custom-fields/custom-fields.component.html +++ b/src-ui/src/app/components/manage/custom-fields/custom-fields.component.html @@ -42,7 +42,13 @@ @if (field.document_count > 0) { - + Filter Documents ({{ field.document_count }}) } @@ -57,9 +63,13 @@ @if (field.document_count > 0) {
- + +  Documents{{ field.document_count }} +
} diff --git a/src-ui/src/app/components/manage/custom-fields/custom-fields.component.spec.ts b/src-ui/src/app/components/manage/custom-fields/custom-fields.component.spec.ts index e94470d64..b86d476f3 100644 --- a/src-ui/src/app/components/manage/custom-fields/custom-fields.component.spec.ts +++ b/src-ui/src/app/components/manage/custom-fields/custom-fields.component.spec.ts @@ -4,6 +4,7 @@ import { provideHttpClient, withInterceptorsFromDi } from '@angular/common/http' import { provideHttpClientTesting } from '@angular/common/http/testing' import { FormsModule, ReactiveFormsModule } from '@angular/forms' import { By } from '@angular/platform-browser' +import { RouterTestingModule } from '@angular/router/testing' import { NgbModal, NgbModalModule, @@ -61,6 +62,7 @@ describe('CustomFieldsComponent', () => { NgbModalModule, NgbPopoverModule, NgxBootstrapIconsModule.pick(allIcons), + RouterTestingModule, CustomFieldsComponent, IfPermissionsDirective, PageHeaderComponent, @@ -108,7 +110,9 @@ describe('CustomFieldsComponent', () => { const toastInfoSpy = jest.spyOn(toastService, 'showInfo') const reloadSpy = jest.spyOn(component, 'reload') - const createButton = fixture.debugElement.queryAll(By.css('button'))[1] + const createButton = fixture.debugElement + .queryAll(By.css('button')) + .find((btn) => btn.nativeElement.textContent.trim().includes('Add Field')) createButton.triggerEventHandler('click') expect(modal).not.toBeUndefined() @@ -133,7 +137,11 @@ describe('CustomFieldsComponent', () => { const toastInfoSpy = jest.spyOn(toastService, 'showInfo') const reloadSpy = jest.spyOn(component, 'reload') - const editButton = fixture.debugElement.queryAll(By.css('button'))[2] + const editButton = fixture.debugElement + .queryAll(By.css('button')) + .find((btn) => + btn.nativeElement.textContent.trim().includes(fields[0].name) + ) editButton.triggerEventHandler('click') expect(modal).not.toBeUndefined() @@ -158,7 +166,9 @@ describe('CustomFieldsComponent', () => { const 
deleteSpy = jest.spyOn(customFieldsService, 'delete') const reloadSpy = jest.spyOn(component, 'reload') - const deleteButton = fixture.debugElement.queryAll(By.css('button'))[5] + const deleteButton = fixture.debugElement + .queryAll(By.css('button')) + .find((btn) => btn.nativeElement.textContent.trim().includes('Delete')) deleteButton.triggerEventHandler('click') expect(modal).not.toBeUndefined() @@ -176,10 +186,10 @@ describe('CustomFieldsComponent', () => { expect(reloadSpy).toHaveBeenCalled() }) - it('should support filter documents', () => { - const filterSpy = jest.spyOn(listViewService, 'quickFilter') - component.filterDocuments(fields[0]) - expect(filterSpy).toHaveBeenCalledWith([ + it('should provide document filter url', () => { + const urlSpy = jest.spyOn(listViewService, 'getQuickFilterUrl') + component.getDocumentFilterUrl(fields[0]) + expect(urlSpy).toHaveBeenCalledWith([ { rule_type: FILTER_CUSTOM_FIELDS_QUERY, value: JSON.stringify([ diff --git a/src-ui/src/app/components/manage/custom-fields/custom-fields.component.ts b/src-ui/src/app/components/manage/custom-fields/custom-fields.component.ts index 9e7ecf78a..8ecd713ef 100644 --- a/src-ui/src/app/components/manage/custom-fields/custom-fields.component.ts +++ b/src-ui/src/app/components/manage/custom-fields/custom-fields.component.ts @@ -1,4 +1,5 @@ import { Component, OnInit, inject } from '@angular/core' +import { RouterModule } from '@angular/router' import { NgbDropdownModule, NgbModal, @@ -36,6 +37,7 @@ import { LoadingComponentWithPermissions } from '../../loading-component/loading NgbDropdownModule, NgbPaginationModule, NgxBootstrapIconsModule, + RouterModule, ], }) export class CustomFieldsComponent @@ -130,8 +132,8 @@ export class CustomFieldsComponent return DATA_TYPE_LABELS.find((l) => l.id === field.data_type).name } - filterDocuments(field: CustomField) { - this.documentListViewService.quickFilter([ + getDocumentFilterUrl(field: CustomField) { + return 
this.documentListViewService.getQuickFilterUrl([ { rule_type: FILTER_CUSTOM_FIELDS_QUERY, value: JSON.stringify([ diff --git a/src-ui/src/app/components/manage/document-type-list/document-type-list.component.ts b/src-ui/src/app/components/manage/document-type-list/document-type-list.component.ts index 21a4779e9..b561af2d1 100644 --- a/src-ui/src/app/components/manage/document-type-list/document-type-list.component.ts +++ b/src-ui/src/app/components/manage/document-type-list/document-type-list.component.ts @@ -1,6 +1,7 @@ import { NgClass, NgTemplateOutlet, TitleCasePipe } from '@angular/common' import { Component, inject } from '@angular/core' import { FormsModule, ReactiveFormsModule } from '@angular/forms' +import { RouterModule } from '@angular/router' import { NgbDropdownModule, NgbPaginationModule, @@ -27,6 +28,7 @@ import { ManagementListComponent } from '../management-list/management-list.comp IfPermissionsDirective, FormsModule, ReactiveFormsModule, + RouterModule, NgClass, NgTemplateOutlet, NgbDropdownModule, diff --git a/src-ui/src/app/components/manage/management-list/management-list.component.html b/src-ui/src/app/components/manage/management-list/management-list.component.html index 8fac6f44f..1cfb3aa0d 100644 --- a/src-ui/src/app/components/manage/management-list/management-list.component.html +++ b/src-ui/src/app/components/manage/management-list/management-list.component.html @@ -120,7 +120,14 @@ @if (getDocumentCount(object) > 0) { - + Filter Documents ({{ getDocumentCount(object) }}) } @@ -135,9 +142,15 @@ @if (getDocumentCount(object) > 0) {
- + +  Documents{{ getDocumentCount(object) }} +
} diff --git a/src-ui/src/app/components/manage/management-list/management-list.component.spec.ts b/src-ui/src/app/components/manage/management-list/management-list.component.spec.ts index 813c81148..a9f7a0626 100644 --- a/src-ui/src/app/components/manage/management-list/management-list.component.spec.ts +++ b/src-ui/src/app/components/manage/management-list/management-list.component.spec.ts @@ -13,6 +13,7 @@ import { } from '@angular/core/testing' import { FormsModule, ReactiveFormsModule } from '@angular/forms' import { By } from '@angular/platform-browser' +import { RouterLinkWithHref } from '@angular/router' import { RouterTestingModule } from '@angular/router/testing' import { NgbModal, @@ -230,12 +231,15 @@ describe('ManagementListComponent', () => { }) it('should support quick filter for objects', () => { - const qfSpy = jest.spyOn(documentListViewService, 'quickFilter') - const filterButton = fixture.debugElement.queryAll(By.css('button'))[9] - filterButton.triggerEventHandler('click') - expect(qfSpy).toHaveBeenCalledWith([ + const expectedUrl = documentListViewService.getQuickFilterUrl([ { rule_type: FILTER_HAS_TAGS_ALL, value: tags[0].id.toString() }, - ]) // subclasses set the filter rule type + ]) + const filterLink = fixture.debugElement.query( + By.css('a.btn-outline-secondary') + ) + expect(filterLink).toBeTruthy() + const routerLink = filterLink.injector.get(RouterLinkWithHref) + expect(routerLink.urlTree).toEqual(expectedUrl) }) it('should reload on sort', () => { diff --git a/src-ui/src/app/components/manage/management-list/management-list.component.ts b/src-ui/src/app/components/manage/management-list/management-list.component.ts index b1af1f1d1..e8e7a3bb3 100644 --- a/src-ui/src/app/components/manage/management-list/management-list.component.ts +++ b/src-ui/src/app/components/manage/management-list/management-list.component.ts @@ -230,8 +230,8 @@ export abstract class ManagementListComponent abstract getDeleteMessage(object: T) - 
filterDocuments(object: MatchingModel) { - this.documentListViewService.quickFilter([ + getDocumentFilterUrl(object: MatchingModel) { + return this.documentListViewService.getQuickFilterUrl([ { rule_type: this.filterRuleType, value: object.id.toString() }, ]) } diff --git a/src-ui/src/app/components/manage/storage-path-list/storage-path-list.component.ts b/src-ui/src/app/components/manage/storage-path-list/storage-path-list.component.ts index 413ccc33a..cac8637d7 100644 --- a/src-ui/src/app/components/manage/storage-path-list/storage-path-list.component.ts +++ b/src-ui/src/app/components/manage/storage-path-list/storage-path-list.component.ts @@ -1,6 +1,7 @@ import { NgClass, NgTemplateOutlet, TitleCasePipe } from '@angular/common' import { Component, inject } from '@angular/core' import { FormsModule, ReactiveFormsModule } from '@angular/forms' +import { RouterModule } from '@angular/router' import { NgbDropdownModule, NgbPaginationModule, @@ -27,6 +28,7 @@ import { ManagementListComponent } from '../management-list/management-list.comp IfPermissionsDirective, FormsModule, ReactiveFormsModule, + RouterModule, NgClass, NgTemplateOutlet, NgbDropdownModule, diff --git a/src-ui/src/app/components/manage/tag-list/tag-list.component.ts b/src-ui/src/app/components/manage/tag-list/tag-list.component.ts index 0ba0a0855..544e99b58 100644 --- a/src-ui/src/app/components/manage/tag-list/tag-list.component.ts +++ b/src-ui/src/app/components/manage/tag-list/tag-list.component.ts @@ -1,6 +1,7 @@ import { NgClass, NgTemplateOutlet, TitleCasePipe } from '@angular/common' import { Component, inject } from '@angular/core' import { FormsModule, ReactiveFormsModule } from '@angular/forms' +import { RouterModule } from '@angular/router' import { NgbDropdownModule, NgbPaginationModule, @@ -27,6 +28,7 @@ import { ManagementListComponent } from '../management-list/management-list.comp IfPermissionsDirective, FormsModule, ReactiveFormsModule, + RouterModule, NgClass, NgTemplateOutlet, 
NgbDropdownModule, diff --git a/src-ui/src/app/services/document-list-view.service.spec.ts b/src-ui/src/app/services/document-list-view.service.spec.ts index 82d3ac425..fdbfa2069 100644 --- a/src-ui/src/app/services/document-list-view.service.spec.ts +++ b/src-ui/src/app/services/document-list-view.service.spec.ts @@ -651,4 +651,25 @@ describe('DocumentListViewService', () => { documentListViewService.displayFields = customFields as any expect(documentListViewService.displayFields).toEqual(['custom_field_1']) }) + + it('should generate quick filter URL with filter rules', () => { + const routerSpy = jest.spyOn(router, 'createUrlTree') + const urlTree = documentListViewService.getQuickFilterUrl(filterRules) + expect(routerSpy).toHaveBeenCalledWith(['/documents'], { + queryParams: expect.objectContaining({ + tags__id__all: tags__id__all, + }), + }) + expect(urlTree).toBeDefined() + }) + + it('should generate quick filter URL preserving default state', () => { + documentListViewService.reload() + httpTestingController.expectOne( + `${environment.apiBaseUrl}documents/?page=1&page_size=50&ordering=-created&truncate_content=true` + ) + const urlTree = documentListViewService.getQuickFilterUrl(filterRules) + expect(urlTree).toBeDefined() + expect(router.createUrlTree).toBeDefined() + }) }) diff --git a/src-ui/src/app/services/document-list-view.service.ts b/src-ui/src/app/services/document-list-view.service.ts index 9c64a7641..0bc43b782 100644 --- a/src-ui/src/app/services/document-list-view.service.ts +++ b/src-ui/src/app/services/document-list-view.service.ts @@ -1,5 +1,5 @@ import { Injectable, inject } from '@angular/core' -import { ParamMap, Router } from '@angular/router' +import { ParamMap, Router, UrlTree } from '@angular/router' import { Observable, Subject, first, takeUntil } from 'rxjs' import { DEFAULT_DISPLAY_FIELDS, @@ -483,6 +483,18 @@ export class DocumentListViewService { this.router.navigate(['documents']) } + getQuickFilterUrl(filterRules: 
FilterRule[]): UrlTree { + const defaultState = { + ...this.defaultListViewState(), + ...this.listViewStates.get(null), + filterRules, + } + const params = paramsFromViewState(defaultState) + return this.router.createUrlTree(['/documents'], { + queryParams: params, + }) + } + getLastPage(): number { return Math.ceil(this.collectionSize / this.pageSize) } diff --git a/src-ui/src/main.ts b/src-ui/src/main.ts index 13accf3b1..0dca8e9cb 100644 --- a/src-ui/src/main.ts +++ b/src-ui/src/main.ts @@ -135,6 +135,7 @@ import { threeDotsVertical, trash, uiRadios, + unlock, upcScan, windowStack, x, @@ -353,6 +354,7 @@ const icons = { threeDotsVertical, trash, uiRadios, + unlock, upcScan, windowStack, x, diff --git a/src/documents/bulk_edit.py b/src/documents/bulk_edit.py index 219947d09..43cb13261 100644 --- a/src/documents/bulk_edit.py +++ b/src/documents/bulk_edit.py @@ -646,6 +646,77 @@ def edit_pdf( return "OK" +def remove_password( + doc_ids: list[int], + password: str, + *, + update_document: bool = False, + delete_original: bool = False, + include_metadata: bool = True, + user: User | None = None, +) -> Literal["OK"]: + """ + Remove password protection from PDF documents. 
+ """ + import pikepdf + + for doc_id in doc_ids: + doc = Document.objects.get(id=doc_id) + try: + logger.info( + f"Attempting password removal from document {doc.id}", + ) + with pikepdf.open(doc.source_path, password=password) as pdf: + temp_path = doc.source_path.with_suffix(".tmp.pdf") + pdf.remove_unreferenced_resources() + pdf.save(temp_path) + + if update_document: + # replace the original document with the unprotected one + temp_path.replace(doc.source_path) + doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest() + doc.page_count = len(pdf.pages) + doc.save() + update_document_content_maybe_archive_file.delay(document_id=doc.id) + else: + consume_tasks = [] + overrides = ( + DocumentMetadataOverrides().from_document(doc) + if include_metadata + else DocumentMetadataOverrides() + ) + if user is not None: + overrides.owner_id = user.id + + filepath: Path = ( + Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR)) + / f"{doc.id}_unprotected.pdf" + ) + temp_path.replace(filepath) + consume_tasks.append( + consume_file.s( + ConsumableDocument( + source=DocumentSource.ConsumeFolder, + original_file=filepath, + ), + overrides, + ), + ) + + if delete_original: + chord(header=consume_tasks, body=delete.si([doc.id])).delay() + else: + group(consume_tasks).delay() + + except Exception as e: + logger.exception(f"Error removing password from document {doc.id}: {e}") + raise ValueError( + f"An error occurred while removing the password: {e}", + ) from e + + return "OK" + + def reflect_doclinks( document: Document, field: CustomField, diff --git a/src/documents/index.py b/src/documents/index.py index 6b994ac8c..ea26ea926 100644 --- a/src/documents/index.py +++ b/src/documents/index.py @@ -10,6 +10,7 @@ from datetime import time from datetime import timedelta from datetime import timezone from shutil import rmtree +from time import sleep from typing import TYPE_CHECKING from typing import Literal @@ -32,6 +33,7 @@ from whoosh.highlight import HtmlFormatter 
from whoosh.idsets import BitSet from whoosh.idsets import DocIdSet from whoosh.index import FileIndex +from whoosh.index import LockError from whoosh.index import create_in from whoosh.index import exists_in from whoosh.index import open_dir @@ -97,11 +99,33 @@ def get_schema() -> Schema: def open_index(*, recreate=False) -> FileIndex: - try: - if exists_in(settings.INDEX_DIR) and not recreate: - return open_dir(settings.INDEX_DIR, schema=get_schema()) - except Exception: - logger.exception("Error while opening the index, recreating.") + transient_exceptions = (FileNotFoundError, LockError) + max_retries = 3 + retry_delay = 0.1 + + for attempt in range(max_retries + 1): + try: + if exists_in(settings.INDEX_DIR) and not recreate: + return open_dir(settings.INDEX_DIR, schema=get_schema()) + break + except transient_exceptions as exc: + is_last_attempt = attempt == max_retries or recreate + if is_last_attempt: + logger.exception( + "Error while opening the index after retries, recreating.", + ) + break + + logger.warning( + "Transient error while opening the index (attempt %s/%s): %s. 
Retrying.", + attempt + 1, + max_retries + 1, + exc, + ) + sleep(retry_delay) + except Exception: + logger.exception("Error while opening the index, recreating.") + break # create_in doesn't handle corrupted indexes very well, remove the directory entirely first if settings.INDEX_DIR.is_dir(): diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py index 5c71de9a9..6e2307c2e 100644 --- a/src/documents/serialisers.py +++ b/src/documents/serialisers.py @@ -1430,6 +1430,7 @@ class BulkEditSerializer( "split", "delete_pages", "edit_pdf", + "remove_password", ], label="Method", write_only=True, @@ -1505,6 +1506,8 @@ class BulkEditSerializer( return bulk_edit.delete_pages elif method == "edit_pdf": return bulk_edit.edit_pdf + elif method == "remove_password": + return bulk_edit.remove_password else: # pragma: no cover # This will never happen as it is handled by the ChoiceField raise serializers.ValidationError("Unsupported method.") @@ -1701,6 +1704,12 @@ class BulkEditSerializer( f"Page {op['page']} is out of bounds for document with {doc.page_count} pages.", ) + def validate_parameters_remove_password(self, parameters): + if "password" not in parameters: + raise serializers.ValidationError("password not specified") + if not isinstance(parameters["password"], str): + raise serializers.ValidationError("password must be a string") + def validate(self, attrs): method = attrs["method"] parameters = attrs["parameters"] @@ -1741,6 +1750,8 @@ class BulkEditSerializer( "Edit PDF method only supports one document", ) self._validate_parameters_edit_pdf(parameters, attrs["documents"][0]) + elif method == bulk_edit.remove_password: + self.validate_parameters_remove_password(parameters) return attrs diff --git a/src/documents/tasks.py b/src/documents/tasks.py index 6d0e2f0c0..df2452450 100644 --- a/src/documents/tasks.py +++ b/src/documents/tasks.py @@ -508,7 +508,7 @@ def check_scheduled_workflows(): trigger.schedule_is_recurring and workflow_runs.exists() and ( - 
workflow_runs.last().run_at + workflow_runs.first().run_at > now - datetime.timedelta( days=trigger.schedule_recurring_interval_days, diff --git a/src/documents/tests/test_api_bulk_edit.py b/src/documents/tests/test_api_bulk_edit.py index 945f06b67..2ba9f1af6 100644 --- a/src/documents/tests/test_api_bulk_edit.py +++ b/src/documents/tests/test_api_bulk_edit.py @@ -1582,6 +1582,58 @@ class TestBulkEditAPI(DirectoriesMixin, APITestCase): self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) self.assertIn(b"out of bounds", response.content) + @mock.patch("documents.serialisers.bulk_edit.remove_password") + def test_remove_password(self, m): + self.setup_mock(m, "remove_password") + response = self.client.post( + "/api/documents/bulk_edit/", + json.dumps( + { + "documents": [self.doc2.id], + "method": "remove_password", + "parameters": {"password": "secret", "update_document": True}, + }, + ), + content_type="application/json", + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + m.assert_called_once() + args, kwargs = m.call_args + self.assertCountEqual(args[0], [self.doc2.id]) + self.assertEqual(kwargs["password"], "secret") + self.assertTrue(kwargs["update_document"]) + self.assertEqual(kwargs["user"], self.user) + + def test_remove_password_invalid_params(self): + response = self.client.post( + "/api/documents/bulk_edit/", + json.dumps( + { + "documents": [self.doc2.id], + "method": "remove_password", + "parameters": {}, + }, + ), + content_type="application/json", + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn(b"password not specified", response.content) + + response = self.client.post( + "/api/documents/bulk_edit/", + json.dumps( + { + "documents": [self.doc2.id], + "method": "remove_password", + "parameters": {"password": 123}, + }, + ), + content_type="application/json", + ) + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn(b"password must be a string", 
response.content) + @override_settings(AUDIT_LOG_ENABLED=True) def test_bulk_edit_audit_log_enabled_simple_field(self): """ diff --git a/src/documents/tests/test_bulk_edit.py b/src/documents/tests/test_bulk_edit.py index c220c1e9b..bf5033bdc 100644 --- a/src/documents/tests/test_bulk_edit.py +++ b/src/documents/tests/test_bulk_edit.py @@ -1,3 +1,4 @@ +import hashlib import shutil from datetime import date from pathlib import Path @@ -1066,3 +1067,147 @@ class TestPDFActions(DirectoriesMixin, TestCase): bulk_edit.edit_pdf(doc_ids, operations, update_document=True) mock_group.assert_not_called() mock_consume_file.assert_not_called() + + @mock.patch("documents.bulk_edit.update_document_content_maybe_archive_file.delay") + @mock.patch("pikepdf.open") + def test_remove_password_update_document(self, mock_open, mock_update_document): + doc = self.doc1 + original_checksum = doc.checksum + + fake_pdf = mock.MagicMock() + fake_pdf.pages = [mock.Mock(), mock.Mock(), mock.Mock()] + + def save_side_effect(target_path): + Path(target_path).write_bytes(b"new pdf content") + + fake_pdf.save.side_effect = save_side_effect + mock_open.return_value.__enter__.return_value = fake_pdf + + result = bulk_edit.remove_password( + [doc.id], + password="secret", + update_document=True, + ) + + self.assertEqual(result, "OK") + mock_open.assert_called_once_with(doc.source_path, password="secret") + fake_pdf.remove_unreferenced_resources.assert_called_once() + doc.refresh_from_db() + self.assertNotEqual(doc.checksum, original_checksum) + expected_checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest() + self.assertEqual(doc.checksum, expected_checksum) + self.assertEqual(doc.page_count, len(fake_pdf.pages)) + mock_update_document.assert_called_once_with(document_id=doc.id) + + @mock.patch("documents.bulk_edit.chord") + @mock.patch("documents.bulk_edit.group") + @mock.patch("documents.tasks.consume_file.s") + @mock.patch("documents.bulk_edit.tempfile.mkdtemp") + 
@mock.patch("pikepdf.open") + def test_remove_password_creates_consumable_document( + self, + mock_open, + mock_mkdtemp, + mock_consume_file, + mock_group, + mock_chord, + ): + doc = self.doc2 + temp_dir = self.dirs.scratch_dir / "remove-password" + temp_dir.mkdir(parents=True, exist_ok=True) + mock_mkdtemp.return_value = str(temp_dir) + + fake_pdf = mock.MagicMock() + fake_pdf.pages = [mock.Mock(), mock.Mock()] + + def save_side_effect(target_path): + Path(target_path).write_bytes(b"password removed") + + fake_pdf.save.side_effect = save_side_effect + mock_open.return_value.__enter__.return_value = fake_pdf + mock_group.return_value.delay.return_value = None + + user = User.objects.create(username="owner") + + result = bulk_edit.remove_password( + [doc.id], + password="secret", + include_metadata=False, + update_document=False, + delete_original=False, + user=user, + ) + + self.assertEqual(result, "OK") + mock_open.assert_called_once_with(doc.source_path, password="secret") + mock_consume_file.assert_called_once() + consume_args, _ = mock_consume_file.call_args + consumable_document = consume_args[0] + overrides = consume_args[1] + expected_path = temp_dir / f"{doc.id}_unprotected.pdf" + self.assertTrue(expected_path.exists()) + self.assertEqual( + Path(consumable_document.original_file).resolve(), + expected_path.resolve(), + ) + self.assertEqual(overrides.owner_id, user.id) + mock_group.assert_called_once_with([mock_consume_file.return_value]) + mock_group.return_value.delay.assert_called_once() + mock_chord.assert_not_called() + + @mock.patch("documents.bulk_edit.delete") + @mock.patch("documents.bulk_edit.chord") + @mock.patch("documents.bulk_edit.group") + @mock.patch("documents.tasks.consume_file.s") + @mock.patch("documents.bulk_edit.tempfile.mkdtemp") + @mock.patch("pikepdf.open") + def test_remove_password_deletes_original( + self, + mock_open, + mock_mkdtemp, + mock_consume_file, + mock_group, + mock_chord, + mock_delete, + ): + doc = self.doc2 + 
temp_dir = self.dirs.scratch_dir / "remove-password-delete" + temp_dir.mkdir(parents=True, exist_ok=True) + mock_mkdtemp.return_value = str(temp_dir) + + fake_pdf = mock.MagicMock() + fake_pdf.pages = [mock.Mock(), mock.Mock()] + + def save_side_effect(target_path): + Path(target_path).write_bytes(b"password removed") + + fake_pdf.save.side_effect = save_side_effect + mock_open.return_value.__enter__.return_value = fake_pdf + mock_chord.return_value.delay.return_value = None + + result = bulk_edit.remove_password( + [doc.id], + password="secret", + include_metadata=False, + update_document=False, + delete_original=True, + ) + + self.assertEqual(result, "OK") + mock_open.assert_called_once_with(doc.source_path, password="secret") + mock_consume_file.assert_called_once() + mock_group.assert_not_called() + mock_chord.assert_called_once() + mock_chord.return_value.delay.assert_called_once() + mock_delete.si.assert_called_once_with([doc.id]) + + @mock.patch("pikepdf.open") + def test_remove_password_open_failure(self, mock_open): + mock_open.side_effect = RuntimeError("wrong password") + + with self.assertLogs("paperless.bulk_edit", level="ERROR") as cm: + with self.assertRaises(ValueError) as exc: + bulk_edit.remove_password([self.doc1.id], password="secret") + + self.assertIn("wrong password", str(exc.exception)) + self.assertIn("Error removing password from document", cm.output[0]) diff --git a/src/documents/tests/test_index.py b/src/documents/tests/test_index.py index f216feedb..3167bb762 100644 --- a/src/documents/tests/test_index.py +++ b/src/documents/tests/test_index.py @@ -1,6 +1,7 @@ from datetime import datetime from unittest import mock +from django.conf import settings from django.contrib.auth.models import User from django.test import SimpleTestCase from django.test import TestCase @@ -251,3 +252,120 @@ class TestRewriteNaturalDateKeywords(SimpleTestCase): result = self._rewrite_with_now("added:today", fixed_now) # Should convert to UTC properly 
self.assertIn("added:[20250719", result) + + +class TestIndexResilience(DirectoriesMixin, SimpleTestCase): + def _assert_recreate_called(self, mock_create_in): + mock_create_in.assert_called_once() + path_arg, schema_arg = mock_create_in.call_args.args + self.assertEqual(path_arg, settings.INDEX_DIR) + self.assertEqual(schema_arg.__class__.__name__, "Schema") + + def test_transient_missing_segment_does_not_force_recreate(self): + """ + GIVEN: + - Index directory exists + WHEN: + - open_index is called + - Opening the index raises FileNotFoundError once due to a + transient missing segment + THEN: + - Index is opened successfully on retry + - Index is not recreated + """ + file_marker = settings.INDEX_DIR / "file_marker.txt" + file_marker.write_text("keep") + expected_index = object() + + with ( + mock.patch("documents.index.exists_in", return_value=True), + mock.patch( + "documents.index.open_dir", + side_effect=[FileNotFoundError("missing"), expected_index], + ) as mock_open_dir, + mock.patch( + "documents.index.create_in", + ) as mock_create_in, + mock.patch( + "documents.index.rmtree", + ) as mock_rmtree, + ): + ix = index.open_index() + + self.assertIs(ix, expected_index) + self.assertGreaterEqual(mock_open_dir.call_count, 2) + mock_rmtree.assert_not_called() + mock_create_in.assert_not_called() + self.assertEqual(file_marker.read_text(), "keep") + + def test_transient_errors_exhaust_retries_and_recreate(self): + """ + GIVEN: + - Index directory exists + WHEN: + - open_index is called + - Opening the index raises FileNotFoundError multiple times due to + transient missing segments + THEN: + - Index is recreated after retries are exhausted + """ + recreated_index = object() + + with ( + self.assertLogs("paperless.index", level="ERROR") as cm, + mock.patch("documents.index.exists_in", return_value=True), + mock.patch( + "documents.index.open_dir", + side_effect=FileNotFoundError("missing"), + ) as mock_open_dir, + mock.patch("documents.index.rmtree") as 
mock_rmtree, + mock.patch( + "documents.index.create_in", + return_value=recreated_index, + ) as mock_create_in, + ): + ix = index.open_index() + + self.assertIs(ix, recreated_index) + self.assertEqual(mock_open_dir.call_count, 4) + mock_rmtree.assert_called_once_with(settings.INDEX_DIR) + self._assert_recreate_called(mock_create_in) + self.assertIn( + "Error while opening the index after retries, recreating.", + cm.output[0], + ) + + def test_non_transient_error_recreates_index(self): + """ + GIVEN: + - Index directory exists + WHEN: + - open_index is called + - Opening the index raises a "non-transient" error + THEN: + - Index is recreated + """ + recreated_index = object() + + with ( + self.assertLogs("paperless.index", level="ERROR") as cm, + mock.patch("documents.index.exists_in", return_value=True), + mock.patch( + "documents.index.open_dir", + side_effect=RuntimeError("boom"), + ), + mock.patch("documents.index.rmtree") as mock_rmtree, + mock.patch( + "documents.index.create_in", + return_value=recreated_index, + ) as mock_create_in, + ): + ix = index.open_index() + + self.assertIs(ix, recreated_index) + mock_rmtree.assert_called_once_with(settings.INDEX_DIR) + self._assert_recreate_called(mock_create_in) + self.assertIn( + "Error while opening the index, recreating.", + cm.output[0], + ) diff --git a/src/documents/tests/test_workflows.py b/src/documents/tests/test_workflows.py index 249183b6e..deb40a165 100644 --- a/src/documents/tests/test_workflows.py +++ b/src/documents/tests/test_workflows.py @@ -2094,6 +2094,68 @@ class TestWorkflows( doc.refresh_from_db() self.assertIsNone(doc.owner) + def test_workflow_scheduled_recurring_respects_latest_run(self): + """ + GIVEN: + - Scheduled workflow marked as recurring with a 1-day interval + - Document that matches the trigger + - Two prior runs exist: one 2 days ago and one 1 hour ago + WHEN: + - Scheduled workflows are checked again + THEN: + - Workflow does not run because the most recent run is inside the 
interval + """ + trigger = WorkflowTrigger.objects.create( + type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED, + schedule_date_field=WorkflowTrigger.ScheduleDateField.CREATED, + schedule_is_recurring=True, + schedule_recurring_interval_days=1, + ) + action = WorkflowAction.objects.create( + assign_title="Doc assign owner", + assign_owner=self.user2, + ) + w = Workflow.objects.create( + name="Workflow 1", + order=0, + ) + w.triggers.add(trigger) + w.actions.add(action) + w.save() + + doc = Document.objects.create( + title="sample test", + correspondent=self.c, + original_filename="sample.pdf", + created=timezone.now().date() - timedelta(days=3), + ) + + WorkflowRun.objects.create( + workflow=w, + document=doc, + type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED, + run_at=timezone.now() - timedelta(days=2), + ) + WorkflowRun.objects.create( + workflow=w, + document=doc, + type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED, + run_at=timezone.now() - timedelta(hours=1), + ) + + tasks.check_scheduled_workflows() + + doc.refresh_from_db() + self.assertIsNone(doc.owner) + self.assertEqual( + WorkflowRun.objects.filter( + workflow=w, + document=doc, + type=WorkflowTrigger.WorkflowTriggerType.SCHEDULED, + ).count(), + 2, + ) + def test_workflow_scheduled_trigger_negative_offset_customfield(self): """ GIVEN: diff --git a/src/documents/views.py b/src/documents/views.py index 4f3be2806..0488de1d9 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -1631,6 +1631,7 @@ class BulkEditView(PassUserMixin): "merge": None, "edit_pdf": "checksum", "reprocess": "checksum", + "remove_password": "checksum", } permission_classes = (IsAuthenticated,) @@ -1649,6 +1650,7 @@ class BulkEditView(PassUserMixin): bulk_edit.split, bulk_edit.merge, bulk_edit.edit_pdf, + bulk_edit.remove_password, ]: parameters["user"] = user @@ -1677,6 +1679,7 @@ class BulkEditView(PassUserMixin): bulk_edit.rotate, bulk_edit.delete_pages, bulk_edit.edit_pdf, + bulk_edit.remove_password, ] ) or ( 
@@ -1693,7 +1696,7 @@ class BulkEditView(PassUserMixin): and ( method in [bulk_edit.split, bulk_edit.merge] or ( - method == bulk_edit.edit_pdf + method in [bulk_edit.edit_pdf, bulk_edit.remove_password] and not parameters["update_document"] ) ) diff --git a/src/locale/en_US/LC_MESSAGES/django.po b/src/locale/en_US/LC_MESSAGES/django.po index 75cd392ad..850c20ed5 100644 --- a/src/locale/en_US/LC_MESSAGES/django.po +++ b/src/locale/en_US/LC_MESSAGES/django.po @@ -2,7 +2,7 @@ msgid "" msgstr "" "Project-Id-Version: paperless-ngx\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2026-01-06 17:11+0000\n" +"POT-Creation-Date: 2026-01-08 21:50+0000\n" "PO-Revision-Date: 2022-02-17 04:17\n" "Last-Translator: \n" "Language-Team: English\n" @@ -1223,31 +1223,31 @@ msgstr "" msgid "Invalid color." msgstr "" -#: documents/serialisers.py:1835 +#: documents/serialisers.py:1846 #, python-format msgid "File type %(type)s not supported" msgstr "" -#: documents/serialisers.py:1879 +#: documents/serialisers.py:1890 #, python-format msgid "Custom field id must be an integer: %(id)s" msgstr "" -#: documents/serialisers.py:1886 +#: documents/serialisers.py:1897 #, python-format msgid "Custom field with id %(id)s does not exist" msgstr "" -#: documents/serialisers.py:1903 documents/serialisers.py:1913 +#: documents/serialisers.py:1914 documents/serialisers.py:1924 msgid "" "Custom fields must be a list of integers or an object mapping ids to values." msgstr "" -#: documents/serialisers.py:1908 +#: documents/serialisers.py:1919 msgid "Some custom fields don't exist or were specified twice." msgstr "" -#: documents/serialisers.py:2023 +#: documents/serialisers.py:2034 msgid "Invalid variable detected." 
msgstr "" @@ -1702,151 +1702,151 @@ msgstr "" msgid "paperless application settings" msgstr "" -#: paperless/settings.py:767 +#: paperless/settings.py:768 msgid "English (US)" msgstr "" -#: paperless/settings.py:768 +#: paperless/settings.py:769 msgid "Arabic" msgstr "" -#: paperless/settings.py:769 +#: paperless/settings.py:770 msgid "Afrikaans" msgstr "" -#: paperless/settings.py:770 +#: paperless/settings.py:771 msgid "Belarusian" msgstr "" -#: paperless/settings.py:771 +#: paperless/settings.py:772 msgid "Bulgarian" msgstr "" -#: paperless/settings.py:772 +#: paperless/settings.py:773 msgid "Catalan" msgstr "" -#: paperless/settings.py:773 +#: paperless/settings.py:774 msgid "Czech" msgstr "" -#: paperless/settings.py:774 +#: paperless/settings.py:775 msgid "Danish" msgstr "" -#: paperless/settings.py:775 +#: paperless/settings.py:776 msgid "German" msgstr "" -#: paperless/settings.py:776 +#: paperless/settings.py:777 msgid "Greek" msgstr "" -#: paperless/settings.py:777 +#: paperless/settings.py:778 msgid "English (GB)" msgstr "" -#: paperless/settings.py:778 +#: paperless/settings.py:779 msgid "Spanish" msgstr "" -#: paperless/settings.py:779 +#: paperless/settings.py:780 msgid "Persian" msgstr "" -#: paperless/settings.py:780 +#: paperless/settings.py:781 msgid "Finnish" msgstr "" -#: paperless/settings.py:781 +#: paperless/settings.py:782 msgid "French" msgstr "" -#: paperless/settings.py:782 +#: paperless/settings.py:783 msgid "Hungarian" msgstr "" -#: paperless/settings.py:783 +#: paperless/settings.py:784 msgid "Indonesian" msgstr "" -#: paperless/settings.py:784 +#: paperless/settings.py:785 msgid "Italian" msgstr "" -#: paperless/settings.py:785 +#: paperless/settings.py:786 msgid "Japanese" msgstr "" -#: paperless/settings.py:786 +#: paperless/settings.py:787 msgid "Korean" msgstr "" -#: paperless/settings.py:787 +#: paperless/settings.py:788 msgid "Luxembourgish" msgstr "" -#: paperless/settings.py:788 +#: paperless/settings.py:789 msgid "Norwegian" 
msgstr "" -#: paperless/settings.py:789 +#: paperless/settings.py:790 msgid "Dutch" msgstr "" -#: paperless/settings.py:790 +#: paperless/settings.py:791 msgid "Polish" msgstr "" -#: paperless/settings.py:791 +#: paperless/settings.py:792 msgid "Portuguese (Brazil)" msgstr "" -#: paperless/settings.py:792 +#: paperless/settings.py:793 msgid "Portuguese" msgstr "" -#: paperless/settings.py:793 +#: paperless/settings.py:794 msgid "Romanian" msgstr "" -#: paperless/settings.py:794 +#: paperless/settings.py:795 msgid "Russian" msgstr "" -#: paperless/settings.py:795 +#: paperless/settings.py:796 msgid "Slovak" msgstr "" -#: paperless/settings.py:796 +#: paperless/settings.py:797 msgid "Slovenian" msgstr "" -#: paperless/settings.py:797 +#: paperless/settings.py:798 msgid "Serbian" msgstr "" -#: paperless/settings.py:798 +#: paperless/settings.py:799 msgid "Swedish" msgstr "" -#: paperless/settings.py:799 +#: paperless/settings.py:800 msgid "Turkish" msgstr "" -#: paperless/settings.py:800 +#: paperless/settings.py:801 msgid "Ukrainian" msgstr "" -#: paperless/settings.py:801 +#: paperless/settings.py:802 msgid "Vietnamese" msgstr "" -#: paperless/settings.py:802 +#: paperless/settings.py:803 msgid "Chinese Simplified" msgstr "" -#: paperless/settings.py:803 +#: paperless/settings.py:804 msgid "Chinese Traditional" msgstr "" diff --git a/src/paperless/settings.py b/src/paperless/settings.py index d335a0d92..9b94ebb7b 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -334,6 +334,7 @@ INSTALLED_APPS = [ "paperless_tesseract.apps.PaperlessTesseractConfig", "paperless_text.apps.PaperlessTextConfig", "paperless_mail.apps.PaperlessMailConfig", + "paperless_remote.apps.PaperlessRemoteParserConfig", "django.contrib.admin", "rest_framework", "rest_framework.authtoken", @@ -1430,6 +1431,13 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean( "true", ) +############################################################################### +# Remote Parser # 
@register()
def check_remote_parser_configured(app_configs, **kwargs):
    """
    System check for the remote OCR parser settings.

    Returns a single Error when the "azureai" engine is selected but the
    endpoint or the API key is missing or empty, and an empty list otherwise.
    """
    # Any engine other than Azure AI (including none at all) has nothing to verify.
    if settings.REMOTE_OCR_ENGINE != "azureai":
        return []

    # Azure AI needs both values; endpoint is evaluated first, then the key.
    if settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY:
        return []

    return [
        Error(
            "Azure AI remote parser requires endpoint and API key to be configured.",
        ),
    ]
--git a/src/paperless_remote/parsers.py b/src/paperless_remote/parsers.py new file mode 100644 index 000000000..493b7d7bb --- /dev/null +++ b/src/paperless_remote/parsers.py @@ -0,0 +1,118 @@ +from pathlib import Path + +from django.conf import settings + +from paperless_tesseract.parsers import RasterisedDocumentParser + + +class RemoteEngineConfig: + def __init__( + self, + engine: str, + api_key: str | None = None, + endpoint: str | None = None, + ): + self.engine = engine + self.api_key = api_key + self.endpoint = endpoint + + def engine_is_valid(self): + valid = self.engine in ["azureai"] and self.api_key is not None + if self.engine == "azureai": + valid = valid and self.endpoint is not None + return valid + + +class RemoteDocumentParser(RasterisedDocumentParser): + """ + This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision + as this is the only service that provides a remote OCR API with text-embedded PDF output. + """ + + logging_name = "paperless.parsing.remote" + + def get_settings(self) -> RemoteEngineConfig: + """ + Returns the configuration for the remote OCR engine, loaded from Django settings. + """ + return RemoteEngineConfig( + engine=settings.REMOTE_OCR_ENGINE, + api_key=settings.REMOTE_OCR_API_KEY, + endpoint=settings.REMOTE_OCR_ENDPOINT, + ) + + def supported_mime_types(self): + if self.settings.engine_is_valid(): + return { + "application/pdf": ".pdf", + "image/png": ".png", + "image/jpeg": ".jpg", + "image/tiff": ".tiff", + "image/bmp": ".bmp", + "image/gif": ".gif", + "image/webp": ".webp", + } + else: + return {} + + def azure_ai_vision_parse( + self, + file: Path, + ) -> str | None: + """ + Uses Azure AI Vision to parse the document and return the text content. + It requests a searchable PDF output with embedded text. + The PDF is saved to the archive_path attribute. + Returns the text content extracted from the document. + If the parsing fails, it returns None. 
+ """ + from azure.ai.documentintelligence import DocumentIntelligenceClient + from azure.ai.documentintelligence.models import AnalyzeDocumentRequest + from azure.ai.documentintelligence.models import AnalyzeOutputOption + from azure.ai.documentintelligence.models import DocumentContentFormat + from azure.core.credentials import AzureKeyCredential + + client = DocumentIntelligenceClient( + endpoint=self.settings.endpoint, + credential=AzureKeyCredential(self.settings.api_key), + ) + + try: + with file.open("rb") as f: + analyze_request = AnalyzeDocumentRequest(bytes_source=f.read()) + poller = client.begin_analyze_document( + model_id="prebuilt-read", + body=analyze_request, + output_content_format=DocumentContentFormat.TEXT, + output=[AnalyzeOutputOption.PDF], # request searchable PDF output + content_type="application/json", + ) + + poller.wait() + result_id = poller.details["operation_id"] + result = poller.result() + + # Download the PDF with embedded text + self.archive_path = self.tempdir / "archive.pdf" + with self.archive_path.open("wb") as f: + for chunk in client.get_analyze_result_pdf( + model_id="prebuilt-read", + result_id=result_id, + ): + f.write(chunk) + return result.content + except Exception as e: + self.log.error(f"Azure AI Vision parsing failed: {e}") + finally: + client.close() + + return None + + def parse(self, document_path: Path, mime_type, file_name=None): + if not self.settings.engine_is_valid(): + self.log.warning( + "No valid remote parser engine is configured, content will be empty.", + ) + self.text = "" + elif self.settings.engine == "azureai": + self.text = self.azure_ai_vision_parse(document_path) diff --git a/src/paperless_remote/signals.py b/src/paperless_remote/signals.py new file mode 100644 index 000000000..81955a479 --- /dev/null +++ b/src/paperless_remote/signals.py @@ -0,0 +1,18 @@ +def get_parser(*args, **kwargs): + from paperless_remote.parsers import RemoteDocumentParser + + return RemoteDocumentParser(*args, **kwargs) 
class TestChecks(TestCase):
    """Tests for the remote OCR parser system check."""

    @override_settings(REMOTE_OCR_ENGINE=None)
    def test_no_engine(self):
        # Without an engine the check has nothing to complain about.
        found = check_remote_parser_configured(None)
        self.assertEqual(len(found), 0)

    @override_settings(
        REMOTE_OCR_ENGINE="azureai",
        REMOTE_OCR_API_KEY="somekey",
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_azure_no_endpoint(self):
        # Azure AI with a key but no endpoint must yield exactly one error.
        found = check_remote_parser_configured(None)
        self.assertEqual(len(found), 1)

        expected_prefix = (
            "Azure AI remote parser requires endpoint and API key to be configured."
        )
        self.assertTrue(found[0].msg.startswith(expected_prefix))
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Tests for RemoteDocumentParser using a mocked Azure client."""

    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content: str, strings: list[str]):
        # Asserts that all strings appear in content, in the given order.
        indices = []
        for s in strings:
            if s in content:
                indices.append(content.index(s))
            else:
                self.fail(f"'{s}' is not in '{content}'")
        self.assertListEqual(indices, sorted(indices))

    @mock.patch("paperless_tesseract.parsers.run_subprocess")
    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
    def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
        # Arrange mock Azure client
        mock_client = mock.Mock()
        mock_client_cls.return_value = mock_client

        # Simulate poller result and its `.details`
        mock_poller = mock.Mock()
        mock_poller.wait.return_value = None
        mock_poller.details = {"operation_id": "fake-op-id"}
        mock_client.begin_analyze_document.return_value = mock_poller
        mock_poller.result.return_value.content = "This is a test document."

        # Return dummy PDF bytes
        mock_client.get_analyze_result_pdf.return_value = [
            b"%PDF-",
            b"1.7 ",
            b"FAKEPDF",
        ]

        # Simulate pdftotext by writing dummy text to sidecar file
        def fake_run(cmd, *args, **kwargs):
            with Path(cmd[-1]).open("w", encoding="utf-8") as f:
                f.write("This is a test document.")

        mock_subprocess.side_effect = fake_run

        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
        ):
            parser = get_parser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
    def test_get_text_with_azure_error_logged_and_returns_none(self, mock_client_cls):
        # The analyze call blows up; the parser must log it and yield no text.
        mock_client = mock.Mock()
        mock_client.begin_analyze_document.side_effect = RuntimeError("fail")
        mock_client_cls.return_value = mock_client

        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
        ):
            parser = get_parser(uuid.uuid4())
            with mock.patch.object(parser.log, "error") as mock_log_error:
                parser.parse(
                    self.SAMPLE_FILES / "simple-digital.pdf",
                    "application/pdf",
                )

        self.assertIsNone(parser.text)
        mock_client.begin_analyze_document.assert_called_once()
        # The client is closed even though the request failed.
        mock_client.close.assert_called_once()
        mock_log_error.assert_called_once()
        self.assertIn(
            "Azure AI Vision parsing failed",
            mock_log_error.call_args[0][0],
        )

    @override_settings(
        REMOTE_OCR_ENGINE="azureai",
        REMOTE_OCR_API_KEY="key",
        REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
    )
    def test_supported_mime_types_valid_config(self):
        parser = RemoteDocumentParser(uuid.uuid4())
        expected_types = {
            "application/pdf": ".pdf",
            "image/png": ".png",
            "image/jpeg": ".jpg",
            "image/tiff": ".tiff",
            "image/bmp": ".bmp",
            "image/gif": ".gif",
            "image/webp": ".webp",
        }
        self.assertEqual(parser.supported_mime_types(), expected_types)

    # Pin the settings explicitly so the result does not depend on whatever
    # PAPERLESS_REMOTE_OCR_* values happen to be set in the environment.
    @override_settings(
        REMOTE_OCR_ENGINE=None,
        REMOTE_OCR_API_KEY=None,
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_supported_mime_types_invalid_config(self):
        parser = get_parser(uuid.uuid4())
        self.assertEqual(parser.supported_mime_types(), {})

    @override_settings(
        REMOTE_OCR_ENGINE=None,
        REMOTE_OCR_API_KEY=None,
        REMOTE_OCR_ENDPOINT=None,
    )
    def test_parse_with_invalid_config(self):
        # With no engine configured, parsing leaves an empty text.
        parser = get_parser(uuid.uuid4())
        parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
        self.assertEqual(parser.text, "")
"sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 }, +] + +[[package]] +name = "azure-core" +version = "1.33.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, + { name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 }, +] + [[package]] name = "babel" version = "2.17.0" @@ -1817,6 +1845,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" }, ] +[[package]] +name = "isodate" +version = "0.7.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 
}, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -2927,6 +2964,7 @@ name = "paperless-ngx" version = "2.20.3" source = { virtual = "." } dependencies = [ + { name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -3073,6 +3111,7 @@ typing = [ [package.metadata] requires-dist = [ + { name = "azure-ai-documentintelligence", specifier = ">=1.0.2" }, { name = "babel", specifier = ">=2.17" }, { name = "bleach", specifier = "~=6.3.0" }, { name = "celery", extras = ["redis"], specifier = "~=5.5.1" },