mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-02 14:28:14 -06:00
Compare commits
69 Commits
feature-pw
...
feature-re
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
14cfc96a5e | ||
|
|
72fd05501b | ||
|
|
a3c19b1e2d | ||
|
|
2e6458dbcc | ||
|
|
8471507115 | ||
|
|
2cd96610f6 | ||
|
|
9eb81d5458 | ||
|
|
6a5ea49715 | ||
|
|
7d2fe630a5 | ||
|
|
c29dd5485b | ||
|
|
cef100a955 | ||
|
|
4f53d1b6ee | ||
|
|
23cea77548 | ||
|
|
4900af93c6 | ||
|
|
ef834ae808 | ||
|
|
0537e87cb5 | ||
|
|
b4da5c3cd1 | ||
|
|
251b0fb3d6 | ||
|
|
32bdf11f7f | ||
|
|
0627ca69f5 | ||
|
|
f5525bbdff | ||
|
|
a21a2a41a8 | ||
|
|
cc73ed8b86 | ||
|
|
0c706b2316 | ||
|
|
85b7b6874d | ||
|
|
56b26185fa | ||
|
|
6537fade7b | ||
|
|
9f8090816f | ||
|
|
1de7c52478 | ||
|
|
9aaaa6f069 | ||
|
|
c3a20b7797 | ||
|
|
476556379b | ||
|
|
e5cafff043 | ||
|
|
8e0d574e99 | ||
|
|
8a5820328e | ||
|
|
809d62a2f4 | ||
|
|
0d87f94b9b | ||
|
|
315b90f8e5 | ||
|
|
47b2d2964b | ||
|
|
e05639ae4e | ||
|
|
f400a8cb2f | ||
|
|
26abcf5612 | ||
|
|
afde52430d | ||
|
|
716f2da652 | ||
|
|
c54073b7c2 | ||
|
|
247e6f39dc | ||
|
|
1e6dfc4481 | ||
|
|
7cc0750066 | ||
|
|
bd6585d3b4 | ||
|
|
717e828a1d | ||
|
|
07381d48e6 | ||
|
|
dd0ffaf312 | ||
|
|
264504affc | ||
|
|
4feedf2add | ||
|
|
2f76cf9831 | ||
|
|
1002d37f6b | ||
|
|
d260a94740 | ||
|
|
88c69b83ea | ||
|
|
2557ee2014 | ||
|
|
3c75deed80 | ||
|
|
d05343c927 | ||
|
|
e7972b7eaf | ||
|
|
75a091cc0d | ||
|
|
dca74803fd | ||
|
|
3cf3d868d0 | ||
|
|
bf4fc6604a | ||
|
|
e8c1eb86fa | ||
|
|
c3dad3cf69 | ||
|
|
811bd66088 |
4
.github/workflows/translate-strings.yml
vendored
4
.github/workflows/translate-strings.yml
vendored
@@ -12,9 +12,11 @@ jobs:
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
env:
|
||||
GH_REF: ${{ github.ref }} # sonar rule:githubactions:S7630 - avoid injection
|
||||
with:
|
||||
token: ${{ secrets.PNGX_BOT_PAT }}
|
||||
ref: ${{ github.head_ref }}
|
||||
ref: ${{ env.GH_REF }}
|
||||
- name: Set up Python
|
||||
id: setup-python
|
||||
uses: actions/setup-python@v6
|
||||
|
||||
@@ -294,13 +294,6 @@ The following methods are supported:
|
||||
- `"delete_original": true` to delete the original documents after editing.
|
||||
- `"update_document": true` to update the existing document with the edited PDF.
|
||||
- `"include_metadata": true` to copy metadata from the original document to the edited document.
|
||||
- `remove_password`
|
||||
- Requires `parameters`:
|
||||
- `"password": "PASSWORD_STRING"` The password to remove from the PDF documents.
|
||||
- Optional `parameters`:
|
||||
- `"update_document": true` to replace the existing document with the password-less PDF.
|
||||
- `"delete_original": true` to delete the original document after editing.
|
||||
- `"include_metadata": true` to copy metadata from the original document to the new password-less document.
|
||||
- `merge`
|
||||
- No additional `parameters` required.
|
||||
- The ordering of the merged document is determined by the list of IDs.
|
||||
|
||||
@@ -1804,3 +1804,23 @@ password. All of these options come from their similarly-named [Django settings]
|
||||
#### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL}
|
||||
|
||||
: Defaults to false.
|
||||
|
||||
## Remote OCR
|
||||
|
||||
#### [`PAPERLESS_REMOTE_OCR_ENGINE=<str>`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE}
|
||||
|
||||
: The remote OCR engine to use. Currently only Azure AI is supported as "azureai".
|
||||
|
||||
Defaults to None, which disables remote OCR.
|
||||
|
||||
#### [`PAPERLESS_REMOTE_OCR_API_KEY=<str>`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY}
|
||||
|
||||
: The API key to use for the remote OCR engine.
|
||||
|
||||
Defaults to None.
|
||||
|
||||
#### [`PAPERLESS_REMOTE_OCR_ENDPOINT=<str>`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT}
|
||||
|
||||
: The endpoint to use for the remote OCR engine. This is required for Azure AI.
|
||||
|
||||
Defaults to None.
|
||||
|
||||
@@ -25,9 +25,10 @@ physical documents into a searchable online archive so you can keep, well, _less
|
||||
## Features
|
||||
|
||||
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
|
||||
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way.
|
||||
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
|
||||
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
|
||||
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
|
||||
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
|
||||
- _New!_ Supports remote OCR with Azure AI (opt-in).
|
||||
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
|
||||
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
|
||||
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, PowerPoint, and LibreOffice equivalents)[^1] and more.
|
||||
|
||||
@@ -901,6 +901,21 @@ how regularly you intend to scan documents and use paperless.
|
||||
performed the task associated with the document, move it to the
|
||||
inbox.
|
||||
|
||||
## Remote OCR
|
||||
|
||||
!!! important
|
||||
|
||||
This feature is disabled by default and will always remain strictly "opt-in".
|
||||
|
||||
Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to
|
||||
[Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence).
|
||||
This is of course a paid service (with a free tier) which requires an Azure account and subscription. Azure AI is not affiliated with
|
||||
Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing
|
||||
the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details.
|
||||
|
||||
Additionally, when using a commercial service with this feature, consider both potential costs as well as any associated file size
|
||||
or page limitations (e.g. with a free tier).
|
||||
|
||||
## Architecture
|
||||
|
||||
Paperless-ngx consists of the following components:
|
||||
|
||||
@@ -16,6 +16,7 @@ classifiers = [
|
||||
# This will allow testing to not install a webserver, mysql, etc
|
||||
|
||||
dependencies = [
|
||||
"azure-ai-documentintelligence>=1.0.2",
|
||||
"babel>=2.17",
|
||||
"bleach~=6.3.0",
|
||||
"celery[redis]~=5.5.1",
|
||||
@@ -253,6 +254,7 @@ testpaths = [
|
||||
"src/paperless_tesseract/tests/",
|
||||
"src/paperless_tika/tests",
|
||||
"src/paperless_text/tests/",
|
||||
"src/paperless_remote/tests/",
|
||||
]
|
||||
addopts = [
|
||||
"--pythonwarnings=all",
|
||||
|
||||
@@ -1,75 +0,0 @@
|
||||
<div class="modal-header">
|
||||
<h4 class="modal-title" id="modal-basic-title">{{title}}</h4>
|
||||
<button type="button" class="btn-close" aria-label="Close" (click)="cancel()">
|
||||
</button>
|
||||
</div>
|
||||
<div class="modal-body">
|
||||
@if (message) {
|
||||
<p class="mb-3" [innerHTML]="message"></p>
|
||||
}
|
||||
<div class="btn-group mb-3" role="group">
|
||||
<input
|
||||
type="radio"
|
||||
class="btn-check"
|
||||
name="passwordRemoveMode"
|
||||
id="removeReplace"
|
||||
[(ngModel)]="updateDocument"
|
||||
[value]="true"
|
||||
(ngModelChange)="onUpdateDocumentChange($event)"
|
||||
/>
|
||||
<label class="btn btn-outline-primary btn-sm" for="removeReplace">
|
||||
<i-bs name="pencil"></i-bs>
|
||||
<span class="ms-2" i18n>Replace current document</span>
|
||||
</label>
|
||||
<input
|
||||
type="radio"
|
||||
class="btn-check"
|
||||
name="passwordRemoveMode"
|
||||
id="removeCreate"
|
||||
[(ngModel)]="updateDocument"
|
||||
[value]="false"
|
||||
(ngModelChange)="onUpdateDocumentChange($event)"
|
||||
/>
|
||||
<label class="btn btn-outline-primary btn-sm" for="removeCreate">
|
||||
<i-bs name="plus"></i-bs>
|
||||
<span class="ms-2" i18n>Create new document</span>
|
||||
</label>
|
||||
</div>
|
||||
@if (!updateDocument) {
|
||||
<div class="d-flex flex-column flex-md-row w-100 gap-3 align-items-center">
|
||||
<div class="form-group d-flex">
|
||||
<div class="form-check">
|
||||
<input class="form-check-input" type="checkbox" id="copyMetaRemove" [(ngModel)]="includeMetadata" />
|
||||
<label class="form-check-label" for="copyMetaRemove" i18n> Copy metadata
|
||||
</label>
|
||||
</div>
|
||||
<div class="form-check ms-3">
|
||||
<input class="form-check-input" type="checkbox" id="deleteOriginalRemove" [(ngModel)]="deleteOriginal" />
|
||||
<label class="form-check-label" for="deleteOriginalRemove" i18n> Delete original</label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
}
|
||||
</div>
|
||||
<div class="modal-footer flex-nowrap gap-2">
|
||||
<button
|
||||
type="button"
|
||||
class="btn"
|
||||
[class]="cancelBtnClass"
|
||||
(click)="cancel()"
|
||||
[disabled]="!buttonsEnabled"
|
||||
>
|
||||
<span class="d-inline-block" style="padding-bottom: 1px;">
|
||||
{{cancelBtnCaption}}
|
||||
</span>
|
||||
</button>
|
||||
<button
|
||||
type="button"
|
||||
class="btn"
|
||||
[class]="btnClass"
|
||||
(click)="confirm()"
|
||||
[disabled]="!confirmButtonEnabled || !buttonsEnabled"
|
||||
>
|
||||
{{btnCaption}}
|
||||
</button>
|
||||
</div>
|
||||
@@ -1,53 +0,0 @@
|
||||
import { ComponentFixture, TestBed } from '@angular/core/testing'
|
||||
import { By } from '@angular/platform-browser'
|
||||
import { NgbActiveModal } from '@ng-bootstrap/ng-bootstrap'
|
||||
import { NgxBootstrapIconsModule, allIcons } from 'ngx-bootstrap-icons'
|
||||
import { PasswordRemovalConfirmDialogComponent } from './password-removal-confirm-dialog.component'
|
||||
|
||||
describe('PasswordRemovalConfirmDialogComponent', () => {
|
||||
let component: PasswordRemovalConfirmDialogComponent
|
||||
let fixture: ComponentFixture<PasswordRemovalConfirmDialogComponent>
|
||||
|
||||
beforeEach(async () => {
|
||||
await TestBed.configureTestingModule({
|
||||
providers: [NgbActiveModal],
|
||||
imports: [
|
||||
NgxBootstrapIconsModule.pick(allIcons),
|
||||
PasswordRemovalConfirmDialogComponent,
|
||||
],
|
||||
}).compileComponents()
|
||||
|
||||
fixture = TestBed.createComponent(PasswordRemovalConfirmDialogComponent)
|
||||
component = fixture.componentInstance
|
||||
fixture.detectChanges()
|
||||
})
|
||||
|
||||
it('should default to replacing the document', () => {
|
||||
expect(component.updateDocument).toBe(true)
|
||||
expect(
|
||||
fixture.debugElement.query(By.css('#removeReplace')).nativeElement.checked
|
||||
).toBe(true)
|
||||
})
|
||||
|
||||
it('should allow creating a new document with metadata and delete toggle', () => {
|
||||
component.onUpdateDocumentChange(false)
|
||||
fixture.detectChanges()
|
||||
|
||||
expect(component.updateDocument).toBe(false)
|
||||
expect(fixture.debugElement.query(By.css('#copyMetaRemove'))).not.toBeNull()
|
||||
|
||||
component.includeMetadata = false
|
||||
component.deleteOriginal = true
|
||||
component.onUpdateDocumentChange(true)
|
||||
expect(component.updateDocument).toBe(true)
|
||||
expect(component.includeMetadata).toBe(true)
|
||||
expect(component.deleteOriginal).toBe(false)
|
||||
})
|
||||
|
||||
it('should emit confirm when confirmed', () => {
|
||||
let confirmed = false
|
||||
component.confirmClicked.subscribe(() => (confirmed = true))
|
||||
component.confirm()
|
||||
expect(confirmed).toBe(true)
|
||||
})
|
||||
})
|
||||
@@ -1,38 +0,0 @@
|
||||
import { Component, Input } from '@angular/core'
|
||||
import { FormsModule } from '@angular/forms'
|
||||
import { NgxBootstrapIconsModule } from 'ngx-bootstrap-icons'
|
||||
import { ConfirmDialogComponent } from '../confirm-dialog.component'
|
||||
|
||||
@Component({
|
||||
selector: 'pngx-password-removal-confirm-dialog',
|
||||
templateUrl: './password-removal-confirm-dialog.component.html',
|
||||
styleUrls: ['./password-removal-confirm-dialog.component.scss'],
|
||||
imports: [FormsModule, NgxBootstrapIconsModule],
|
||||
})
|
||||
export class PasswordRemovalConfirmDialogComponent extends ConfirmDialogComponent {
|
||||
updateDocument: boolean = true
|
||||
includeMetadata: boolean = true
|
||||
deleteOriginal: boolean = false
|
||||
|
||||
@Input()
|
||||
override title = $localize`Remove password protection`
|
||||
|
||||
@Input()
|
||||
override message =
|
||||
$localize`Create an unprotected copy or replace the existing file.`
|
||||
|
||||
@Input()
|
||||
override btnCaption = $localize`Start`
|
||||
|
||||
constructor() {
|
||||
super()
|
||||
}
|
||||
|
||||
onUpdateDocumentChange(updateDocument: boolean) {
|
||||
this.updateDocument = updateDocument
|
||||
if (this.updateDocument) {
|
||||
this.deleteOriginal = false
|
||||
this.includeMetadata = true
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -65,12 +65,6 @@
|
||||
<button ngbDropdownItem (click)="editPdf()" [disabled]="!userIsOwner || !userCanEdit || originalContentRenderType !== ContentRenderType.PDF">
|
||||
<i-bs name="pencil"></i-bs> <ng-container i18n>PDF Editor</ng-container>
|
||||
</button>
|
||||
|
||||
@if (userIsOwner && (requiresPassword || password)) {
|
||||
<button ngbDropdownItem (click)="removePassword()" [disabled]="!password">
|
||||
<i-bs name="unlock"></i-bs> <ng-container i18n>Remove Password</ng-container>
|
||||
</button>
|
||||
}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
@@ -66,7 +66,6 @@ import { SettingsService } from 'src/app/services/settings.service'
|
||||
import { ToastService } from 'src/app/services/toast.service'
|
||||
import { environment } from 'src/environments/environment'
|
||||
import { ConfirmDialogComponent } from '../common/confirm-dialog/confirm-dialog.component'
|
||||
import { PasswordRemovalConfirmDialogComponent } from '../common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component'
|
||||
import { CustomFieldsDropdownComponent } from '../common/custom-fields-dropdown/custom-fields-dropdown.component'
|
||||
import {
|
||||
DocumentDetailComponent,
|
||||
@@ -1210,88 +1209,6 @@ describe('DocumentDetailComponent', () => {
|
||||
expect(closeSpy).toHaveBeenCalled()
|
||||
})
|
||||
|
||||
it('should support removing password protection from pdfs', () => {
|
||||
let modal: NgbModalRef
|
||||
modalService.activeInstances.subscribe((m) => (modal = m[0]))
|
||||
initNormally()
|
||||
component.password = 'secret'
|
||||
component.removePassword()
|
||||
const dialog =
|
||||
modal.componentInstance as PasswordRemovalConfirmDialogComponent
|
||||
dialog.updateDocument = false
|
||||
dialog.includeMetadata = false
|
||||
dialog.deleteOriginal = true
|
||||
dialog.confirm()
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/bulk_edit/`
|
||||
)
|
||||
expect(req.request.body).toEqual({
|
||||
documents: [doc.id],
|
||||
method: 'remove_password',
|
||||
parameters: {
|
||||
password: 'secret',
|
||||
update_document: false,
|
||||
include_metadata: false,
|
||||
delete_original: true,
|
||||
},
|
||||
})
|
||||
req.flush(true)
|
||||
})
|
||||
|
||||
it('should require the current password before removing it', () => {
|
||||
initNormally()
|
||||
const errorSpy = jest.spyOn(toastService, 'showError')
|
||||
component.requiresPassword = true
|
||||
component.password = ''
|
||||
|
||||
component.removePassword()
|
||||
|
||||
expect(errorSpy).toHaveBeenCalled()
|
||||
httpTestingController.expectNone(
|
||||
`${environment.apiBaseUrl}documents/bulk_edit/`
|
||||
)
|
||||
})
|
||||
|
||||
it('should handle failures when removing password protection', () => {
|
||||
let modal: NgbModalRef
|
||||
modalService.activeInstances.subscribe((m) => (modal = m[0]))
|
||||
initNormally()
|
||||
const errorSpy = jest.spyOn(toastService, 'showError')
|
||||
component.password = 'secret'
|
||||
|
||||
component.removePassword()
|
||||
const dialog =
|
||||
modal.componentInstance as PasswordRemovalConfirmDialogComponent
|
||||
dialog.confirm()
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/bulk_edit/`
|
||||
)
|
||||
req.error(new ErrorEvent('failed'))
|
||||
|
||||
expect(errorSpy).toHaveBeenCalled()
|
||||
expect(component.networkActive).toBe(false)
|
||||
expect(dialog.buttonsEnabled).toBe(true)
|
||||
})
|
||||
|
||||
it('should refresh the document when removing password in update mode', () => {
|
||||
let modal: NgbModalRef
|
||||
modalService.activeInstances.subscribe((m) => (modal = m[0]))
|
||||
const refreshSpy = jest.spyOn(openDocumentsService, 'refreshDocument')
|
||||
initNormally()
|
||||
component.password = 'secret'
|
||||
|
||||
component.removePassword()
|
||||
const dialog =
|
||||
modal.componentInstance as PasswordRemovalConfirmDialogComponent
|
||||
dialog.confirm()
|
||||
const req = httpTestingController.expectOne(
|
||||
`${environment.apiBaseUrl}documents/bulk_edit/`
|
||||
)
|
||||
req.flush(true)
|
||||
|
||||
expect(refreshSpy).toHaveBeenCalledWith(doc.id)
|
||||
})
|
||||
|
||||
it('should support keyboard shortcuts', () => {
|
||||
initNormally()
|
||||
|
||||
|
||||
@@ -83,7 +83,6 @@ import { getFilenameFromContentDisposition } from 'src/app/utils/http'
|
||||
import { ISODateAdapter } from 'src/app/utils/ngb-iso-date-adapter'
|
||||
import * as UTIF from 'utif'
|
||||
import { ConfirmDialogComponent } from '../common/confirm-dialog/confirm-dialog.component'
|
||||
import { PasswordRemovalConfirmDialogComponent } from '../common/confirm-dialog/password-removal-confirm-dialog/password-removal-confirm-dialog.component'
|
||||
import { CustomFieldsDropdownComponent } from '../common/custom-fields-dropdown/custom-fields-dropdown.component'
|
||||
import { CorrespondentEditDialogComponent } from '../common/edit-dialog/correspondent-edit-dialog/correspondent-edit-dialog.component'
|
||||
import { DocumentTypeEditDialogComponent } from '../common/edit-dialog/document-type-edit-dialog/document-type-edit-dialog.component'
|
||||
@@ -176,7 +175,6 @@ export enum ZoomSetting {
|
||||
NgxBootstrapIconsModule,
|
||||
PdfViewerModule,
|
||||
TextAreaComponent,
|
||||
PasswordRemovalConfirmDialogComponent,
|
||||
],
|
||||
})
|
||||
export class DocumentDetailComponent
|
||||
@@ -1430,63 +1428,6 @@ export class DocumentDetailComponent
|
||||
})
|
||||
}
|
||||
|
||||
removePassword() {
|
||||
if (this.requiresPassword || !this.password) {
|
||||
this.toastService.showError(
|
||||
$localize`Please enter the current password before attempting to remove it.`
|
||||
)
|
||||
return
|
||||
}
|
||||
const modal = this.modalService.open(
|
||||
PasswordRemovalConfirmDialogComponent,
|
||||
{
|
||||
backdrop: 'static',
|
||||
}
|
||||
)
|
||||
modal.componentInstance.title = $localize`Remove password protection`
|
||||
modal.componentInstance.message = $localize`Create an unprotected copy or replace the existing file.`
|
||||
modal.componentInstance.btnCaption = $localize`Start`
|
||||
|
||||
modal.componentInstance.confirmClicked
|
||||
.pipe(takeUntil(this.unsubscribeNotifier))
|
||||
.subscribe(() => {
|
||||
const dialog =
|
||||
modal.componentInstance as PasswordRemovalConfirmDialogComponent
|
||||
dialog.buttonsEnabled = false
|
||||
this.networkActive = true
|
||||
this.documentsService
|
||||
.bulkEdit([this.document.id], 'remove_password', {
|
||||
password: this.password,
|
||||
update_document: dialog.updateDocument,
|
||||
include_metadata: dialog.includeMetadata,
|
||||
delete_original: dialog.deleteOriginal,
|
||||
})
|
||||
.pipe(first(), takeUntil(this.unsubscribeNotifier))
|
||||
.subscribe({
|
||||
next: () => {
|
||||
this.toastService.showInfo(
|
||||
$localize`Password removal operation for "${this.document.title}" will begin in the background.`
|
||||
)
|
||||
this.networkActive = false
|
||||
modal.close()
|
||||
if (!dialog.updateDocument && dialog.deleteOriginal) {
|
||||
this.openDocumentService.closeDocument(this.document)
|
||||
} else if (dialog.updateDocument) {
|
||||
this.openDocumentService.refreshDocument(this.documentId)
|
||||
}
|
||||
},
|
||||
error: (error) => {
|
||||
dialog.buttonsEnabled = true
|
||||
this.networkActive = false
|
||||
this.toastService.showError(
|
||||
$localize`Error executing password removal operation`,
|
||||
error
|
||||
)
|
||||
},
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
printDocument() {
|
||||
const printUrl = this.documentsService.getDownloadUrl(
|
||||
this.document.id,
|
||||
|
||||
@@ -132,7 +132,6 @@ import {
|
||||
threeDotsVertical,
|
||||
trash,
|
||||
uiRadios,
|
||||
unlock,
|
||||
upcScan,
|
||||
windowStack,
|
||||
x,
|
||||
@@ -349,7 +348,6 @@ const icons = {
|
||||
threeDotsVertical,
|
||||
trash,
|
||||
uiRadios,
|
||||
unlock,
|
||||
upcScan,
|
||||
windowStack,
|
||||
x,
|
||||
|
||||
@@ -646,77 +646,6 @@ def edit_pdf(
|
||||
return "OK"
|
||||
|
||||
|
||||
def remove_password(
|
||||
doc_ids: list[int],
|
||||
password: str,
|
||||
*,
|
||||
update_document: bool = False,
|
||||
delete_original: bool = False,
|
||||
include_metadata: bool = True,
|
||||
user: User | None = None,
|
||||
) -> Literal["OK"]:
|
||||
"""
|
||||
Remove password protection from PDF documents.
|
||||
"""
|
||||
import pikepdf
|
||||
|
||||
for doc_id in doc_ids:
|
||||
doc = Document.objects.get(id=doc_id)
|
||||
try:
|
||||
logger.info(
|
||||
f"Attempting password removal from document {doc_ids[0]}",
|
||||
)
|
||||
with pikepdf.open(doc.source_path, password=password) as pdf:
|
||||
temp_path = doc.source_path.with_suffix(".tmp.pdf")
|
||||
pdf.remove_unreferenced_resources()
|
||||
pdf.save(temp_path)
|
||||
|
||||
if update_document:
|
||||
# replace the original document with the unprotected one
|
||||
temp_path.replace(doc.source_path)
|
||||
doc.checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
|
||||
doc.page_count = len(pdf.pages)
|
||||
doc.save()
|
||||
update_document_content_maybe_archive_file.delay(document_id=doc.id)
|
||||
else:
|
||||
consume_tasks = []
|
||||
overrides = (
|
||||
DocumentMetadataOverrides().from_document(doc)
|
||||
if include_metadata
|
||||
else DocumentMetadataOverrides()
|
||||
)
|
||||
if user is not None:
|
||||
overrides.owner_id = user.id
|
||||
|
||||
filepath: Path = (
|
||||
Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR))
|
||||
/ f"{doc.id}_unprotected.pdf"
|
||||
)
|
||||
temp_path.replace(filepath)
|
||||
consume_tasks.append(
|
||||
consume_file.s(
|
||||
ConsumableDocument(
|
||||
source=DocumentSource.ConsumeFolder,
|
||||
original_file=filepath,
|
||||
),
|
||||
overrides,
|
||||
),
|
||||
)
|
||||
|
||||
if delete_original:
|
||||
chord(header=consume_tasks, body=delete.si([doc.id])).delay()
|
||||
else:
|
||||
group(consume_tasks).delay()
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Error removing password from document {doc.id}: {e}")
|
||||
raise ValueError(
|
||||
f"An error occurred while removing the password: {e}",
|
||||
) from e
|
||||
|
||||
return "OK"
|
||||
|
||||
|
||||
def reflect_doclinks(
|
||||
document: Document,
|
||||
field: CustomField,
|
||||
|
||||
@@ -18,6 +18,8 @@ from django.core.exceptions import ValidationError
|
||||
from django.core.validators import DecimalValidator
|
||||
from django.core.validators import EmailValidator
|
||||
from django.core.validators import MaxLengthValidator
|
||||
from django.core.validators import MaxValueValidator
|
||||
from django.core.validators import MinValueValidator
|
||||
from django.core.validators import RegexValidator
|
||||
from django.core.validators import integer_validator
|
||||
from django.db.models import Count
|
||||
@@ -875,6 +877,13 @@ class CustomFieldInstanceSerializer(serializers.ModelSerializer):
|
||||
uri_validator(data["value"])
|
||||
elif field.data_type == CustomField.FieldDataType.INT:
|
||||
integer_validator(data["value"])
|
||||
try:
|
||||
value_int = int(data["value"])
|
||||
except (TypeError, ValueError):
|
||||
raise serializers.ValidationError("Enter a valid integer.")
|
||||
# Keep values within the PostgreSQL integer range
|
||||
MinValueValidator(-2147483648)(value_int)
|
||||
MaxValueValidator(2147483647)(value_int)
|
||||
elif (
|
||||
field.data_type == CustomField.FieldDataType.MONETARY
|
||||
and data["value"] != ""
|
||||
@@ -1421,7 +1430,6 @@ class BulkEditSerializer(
|
||||
"split",
|
||||
"delete_pages",
|
||||
"edit_pdf",
|
||||
"remove_password",
|
||||
],
|
||||
label="Method",
|
||||
write_only=True,
|
||||
@@ -1497,8 +1505,6 @@ class BulkEditSerializer(
|
||||
return bulk_edit.delete_pages
|
||||
elif method == "edit_pdf":
|
||||
return bulk_edit.edit_pdf
|
||||
elif method == "remove_password":
|
||||
return bulk_edit.remove_password
|
||||
else: # pragma: no cover
|
||||
# This will never happen as it is handled by the ChoiceField
|
||||
raise serializers.ValidationError("Unsupported method.")
|
||||
@@ -1695,12 +1701,6 @@ class BulkEditSerializer(
|
||||
f"Page {op['page']} is out of bounds for document with {doc.page_count} pages.",
|
||||
)
|
||||
|
||||
def validate_parameters_remove_password(self, parameters):
|
||||
if "password" not in parameters:
|
||||
raise serializers.ValidationError("password not specified")
|
||||
if not isinstance(parameters["password"], str):
|
||||
raise serializers.ValidationError("password must be a string")
|
||||
|
||||
def validate(self, attrs):
|
||||
method = attrs["method"]
|
||||
parameters = attrs["parameters"]
|
||||
@@ -1741,8 +1741,6 @@ class BulkEditSerializer(
|
||||
"Edit PDF method only supports one document",
|
||||
)
|
||||
self._validate_parameters_edit_pdf(parameters, attrs["documents"][0])
|
||||
elif method == bulk_edit.remove_password:
|
||||
self.validate_parameters_remove_password(parameters)
|
||||
|
||||
return attrs
|
||||
|
||||
|
||||
@@ -1582,58 +1582,6 @@ class TestBulkEditAPI(DirectoriesMixin, APITestCase):
|
||||
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
|
||||
self.assertIn(b"out of bounds", response.content)
|
||||
|
||||
@mock.patch("documents.serialisers.bulk_edit.remove_password")
|
||||
def test_remove_password(self, m):
|
||||
self.setup_mock(m, "remove_password")
|
||||
response = self.client.post(
|
||||
"/api/documents/bulk_edit/",
|
||||
json.dumps(
|
||||
{
|
||||
"documents": [self.doc2.id],
|
||||
"method": "remove_password",
|
||||
"parameters": {"password": "secret", "update_document": True},
|
||||
},
|
||||
),
|
||||
content_type="application/json",
|
||||
)
|
||||
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
m.assert_called_once()
|
||||
args, kwargs = m.call_args
|
||||
self.assertCountEqual(args[0], [self.doc2.id])
|
||||
self.assertEqual(kwargs["password"], "secret")
|
||||
self.assertTrue(kwargs["update_document"])
|
||||
self.assertEqual(kwargs["user"], self.user)
|
||||
|
||||
def test_remove_password_invalid_params(self):
|
||||
response = self.client.post(
|
||||
"/api/documents/bulk_edit/",
|
||||
json.dumps(
|
||||
{
|
||||
"documents": [self.doc2.id],
|
||||
"method": "remove_password",
|
||||
"parameters": {},
|
||||
},
|
||||
),
|
||||
content_type="application/json",
|
||||
)
|
||||
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
|
||||
self.assertIn(b"password not specified", response.content)
|
||||
|
||||
response = self.client.post(
|
||||
"/api/documents/bulk_edit/",
|
||||
json.dumps(
|
||||
{
|
||||
"documents": [self.doc2.id],
|
||||
"method": "remove_password",
|
||||
"parameters": {"password": 123},
|
||||
},
|
||||
),
|
||||
content_type="application/json",
|
||||
)
|
||||
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
|
||||
self.assertIn(b"password must be a string", response.content)
|
||||
|
||||
@override_settings(AUDIT_LOG_ENABLED=True)
|
||||
def test_bulk_edit_audit_log_enabled_simple_field(self):
|
||||
"""
|
||||
|
||||
@@ -1664,6 +1664,44 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
|
||||
self.consume_file_mock.assert_not_called()
|
||||
|
||||
def test_patch_document_integer_custom_field_out_of_range(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- An integer custom field
|
||||
- A document
|
||||
WHEN:
|
||||
- Patching the document with an integer value exceeding PostgreSQL's range
|
||||
THEN:
|
||||
- HTTP 400 is returned (validation catches the overflow)
|
||||
- No custom field instance is created
|
||||
"""
|
||||
cf_int = CustomField.objects.create(
|
||||
name="intfield",
|
||||
data_type=CustomField.FieldDataType.INT,
|
||||
)
|
||||
doc = Document.objects.create(
|
||||
title="Doc",
|
||||
checksum="123",
|
||||
mime_type="application/pdf",
|
||||
)
|
||||
|
||||
response = self.client.patch(
|
||||
f"/api/documents/{doc.pk}/",
|
||||
{
|
||||
"custom_fields": [
|
||||
{
|
||||
"field": cf_int.pk,
|
||||
"value": 2**31, # overflow for PostgreSQL integer fields
|
||||
},
|
||||
],
|
||||
},
|
||||
format="json",
|
||||
)
|
||||
|
||||
self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
|
||||
self.assertIn("custom_fields", response.data)
|
||||
self.assertEqual(CustomFieldInstance.objects.count(), 0)
|
||||
|
||||
def test_upload_with_webui_source(self):
|
||||
"""
|
||||
GIVEN: A document with a source file
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import hashlib
|
||||
import shutil
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
@@ -1067,147 +1066,3 @@ class TestPDFActions(DirectoriesMixin, TestCase):
|
||||
bulk_edit.edit_pdf(doc_ids, operations, update_document=True)
|
||||
mock_group.assert_not_called()
|
||||
mock_consume_file.assert_not_called()
|
||||
|
||||
@mock.patch("documents.bulk_edit.update_document_content_maybe_archive_file.delay")
|
||||
@mock.patch("pikepdf.open")
|
||||
def test_remove_password_update_document(self, mock_open, mock_update_document):
|
||||
doc = self.doc1
|
||||
original_checksum = doc.checksum
|
||||
|
||||
fake_pdf = mock.MagicMock()
|
||||
fake_pdf.pages = [mock.Mock(), mock.Mock(), mock.Mock()]
|
||||
|
||||
def save_side_effect(target_path):
|
||||
Path(target_path).write_bytes(b"new pdf content")
|
||||
|
||||
fake_pdf.save.side_effect = save_side_effect
|
||||
mock_open.return_value.__enter__.return_value = fake_pdf
|
||||
|
||||
result = bulk_edit.remove_password(
|
||||
[doc.id],
|
||||
password="secret",
|
||||
update_document=True,
|
||||
)
|
||||
|
||||
self.assertEqual(result, "OK")
|
||||
mock_open.assert_called_once_with(doc.source_path, password="secret")
|
||||
fake_pdf.remove_unreferenced_resources.assert_called_once()
|
||||
doc.refresh_from_db()
|
||||
self.assertNotEqual(doc.checksum, original_checksum)
|
||||
expected_checksum = hashlib.md5(doc.source_path.read_bytes()).hexdigest()
|
||||
self.assertEqual(doc.checksum, expected_checksum)
|
||||
self.assertEqual(doc.page_count, len(fake_pdf.pages))
|
||||
mock_update_document.assert_called_once_with(document_id=doc.id)
|
||||
|
||||
@mock.patch("documents.bulk_edit.chord")
|
||||
@mock.patch("documents.bulk_edit.group")
|
||||
@mock.patch("documents.tasks.consume_file.s")
|
||||
@mock.patch("documents.bulk_edit.tempfile.mkdtemp")
|
||||
@mock.patch("pikepdf.open")
|
||||
def test_remove_password_creates_consumable_document(
|
||||
self,
|
||||
mock_open,
|
||||
mock_mkdtemp,
|
||||
mock_consume_file,
|
||||
mock_group,
|
||||
mock_chord,
|
||||
):
|
||||
doc = self.doc2
|
||||
temp_dir = self.dirs.scratch_dir / "remove-password"
|
||||
temp_dir.mkdir(parents=True, exist_ok=True)
|
||||
mock_mkdtemp.return_value = str(temp_dir)
|
||||
|
||||
fake_pdf = mock.MagicMock()
|
||||
fake_pdf.pages = [mock.Mock(), mock.Mock()]
|
||||
|
||||
def save_side_effect(target_path):
|
||||
Path(target_path).write_bytes(b"password removed")
|
||||
|
||||
fake_pdf.save.side_effect = save_side_effect
|
||||
mock_open.return_value.__enter__.return_value = fake_pdf
|
||||
mock_group.return_value.delay.return_value = None
|
||||
|
||||
user = User.objects.create(username="owner")
|
||||
|
||||
result = bulk_edit.remove_password(
|
||||
[doc.id],
|
||||
password="secret",
|
||||
include_metadata=False,
|
||||
update_document=False,
|
||||
delete_original=False,
|
||||
user=user,
|
||||
)
|
||||
|
||||
self.assertEqual(result, "OK")
|
||||
mock_open.assert_called_once_with(doc.source_path, password="secret")
|
||||
mock_consume_file.assert_called_once()
|
||||
consume_args, _ = mock_consume_file.call_args
|
||||
consumable_document = consume_args[0]
|
||||
overrides = consume_args[1]
|
||||
expected_path = temp_dir / f"{doc.id}_unprotected.pdf"
|
||||
self.assertTrue(expected_path.exists())
|
||||
self.assertEqual(
|
||||
Path(consumable_document.original_file).resolve(),
|
||||
expected_path.resolve(),
|
||||
)
|
||||
self.assertEqual(overrides.owner_id, user.id)
|
||||
mock_group.assert_called_once_with([mock_consume_file.return_value])
|
||||
mock_group.return_value.delay.assert_called_once()
|
||||
mock_chord.assert_not_called()
|
||||
|
||||
@mock.patch("documents.bulk_edit.delete")
|
||||
@mock.patch("documents.bulk_edit.chord")
|
||||
@mock.patch("documents.bulk_edit.group")
|
||||
@mock.patch("documents.tasks.consume_file.s")
|
||||
@mock.patch("documents.bulk_edit.tempfile.mkdtemp")
|
||||
@mock.patch("pikepdf.open")
|
||||
def test_remove_password_deletes_original(
|
||||
self,
|
||||
mock_open,
|
||||
mock_mkdtemp,
|
||||
mock_consume_file,
|
||||
mock_group,
|
||||
mock_chord,
|
||||
mock_delete,
|
||||
):
|
||||
doc = self.doc2
|
||||
temp_dir = self.dirs.scratch_dir / "remove-password-delete"
|
||||
temp_dir.mkdir(parents=True, exist_ok=True)
|
||||
mock_mkdtemp.return_value = str(temp_dir)
|
||||
|
||||
fake_pdf = mock.MagicMock()
|
||||
fake_pdf.pages = [mock.Mock(), mock.Mock()]
|
||||
|
||||
def save_side_effect(target_path):
|
||||
Path(target_path).write_bytes(b"password removed")
|
||||
|
||||
fake_pdf.save.side_effect = save_side_effect
|
||||
mock_open.return_value.__enter__.return_value = fake_pdf
|
||||
mock_chord.return_value.delay.return_value = None
|
||||
|
||||
result = bulk_edit.remove_password(
|
||||
[doc.id],
|
||||
password="secret",
|
||||
include_metadata=False,
|
||||
update_document=False,
|
||||
delete_original=True,
|
||||
)
|
||||
|
||||
self.assertEqual(result, "OK")
|
||||
mock_open.assert_called_once_with(doc.source_path, password="secret")
|
||||
mock_consume_file.assert_called_once()
|
||||
mock_group.assert_not_called()
|
||||
mock_chord.assert_called_once()
|
||||
mock_chord.return_value.delay.assert_called_once()
|
||||
mock_delete.si.assert_called_once_with([doc.id])
|
||||
|
||||
@mock.patch("pikepdf.open")
|
||||
def test_remove_password_open_failure(self, mock_open):
|
||||
mock_open.side_effect = RuntimeError("wrong password")
|
||||
|
||||
with self.assertLogs("paperless.bulk_edit", level="ERROR") as cm:
|
||||
with self.assertRaises(ValueError) as exc:
|
||||
bulk_edit.remove_password([self.doc1.id], password="secret")
|
||||
|
||||
self.assertIn("wrong password", str(exc.exception))
|
||||
self.assertIn("Error removing password from document", cm.output[0])
|
||||
|
||||
@@ -1504,7 +1504,6 @@ class BulkEditView(PassUserMixin):
|
||||
"merge": None,
|
||||
"edit_pdf": "checksum",
|
||||
"reprocess": "checksum",
|
||||
"remove_password": "checksum",
|
||||
}
|
||||
|
||||
permission_classes = (IsAuthenticated,)
|
||||
@@ -1523,7 +1522,6 @@ class BulkEditView(PassUserMixin):
|
||||
bulk_edit.split,
|
||||
bulk_edit.merge,
|
||||
bulk_edit.edit_pdf,
|
||||
bulk_edit.remove_password,
|
||||
]:
|
||||
parameters["user"] = user
|
||||
|
||||
@@ -1552,7 +1550,6 @@ class BulkEditView(PassUserMixin):
|
||||
bulk_edit.rotate,
|
||||
bulk_edit.delete_pages,
|
||||
bulk_edit.edit_pdf,
|
||||
bulk_edit.remove_password,
|
||||
]
|
||||
)
|
||||
or (
|
||||
@@ -1569,7 +1566,7 @@ class BulkEditView(PassUserMixin):
|
||||
and (
|
||||
method in [bulk_edit.split, bulk_edit.merge]
|
||||
or (
|
||||
method in [bulk_edit.edit_pdf, bulk_edit.remove_password]
|
||||
method == bulk_edit.edit_pdf
|
||||
and not parameters["update_document"]
|
||||
)
|
||||
)
|
||||
|
||||
@@ -2,7 +2,7 @@ msgid ""
|
||||
msgstr ""
|
||||
"Project-Id-Version: paperless-ngx\n"
|
||||
"Report-Msgid-Bugs-To: \n"
|
||||
"POT-Creation-Date: 2025-12-24 05:27+0000\n"
|
||||
"POT-Creation-Date: 2025-12-29 14:49+0000\n"
|
||||
"PO-Revision-Date: 2022-02-17 04:17\n"
|
||||
"Last-Translator: \n"
|
||||
"Language-Team: English\n"
|
||||
@@ -1219,35 +1219,35 @@ msgstr ""
|
||||
msgid "workflow runs"
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:640
|
||||
#: documents/serialisers.py:642
|
||||
msgid "Invalid color."
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:1826
|
||||
#: documents/serialisers.py:1835
|
||||
#, python-format
|
||||
msgid "File type %(type)s not supported"
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:1870
|
||||
#: documents/serialisers.py:1879
|
||||
#, python-format
|
||||
msgid "Custom field id must be an integer: %(id)s"
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:1877
|
||||
#: documents/serialisers.py:1886
|
||||
#, python-format
|
||||
msgid "Custom field with id %(id)s does not exist"
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:1894 documents/serialisers.py:1904
|
||||
#: documents/serialisers.py:1903 documents/serialisers.py:1913
|
||||
msgid ""
|
||||
"Custom fields must be a list of integers or an object mapping ids to values."
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:1899
|
||||
#: documents/serialisers.py:1908
|
||||
msgid "Some custom fields don't exist or were specified twice."
|
||||
msgstr ""
|
||||
|
||||
#: documents/serialisers.py:2014
|
||||
#: documents/serialisers.py:2023
|
||||
msgid "Invalid variable detected."
|
||||
msgstr ""
|
||||
|
||||
|
||||
@@ -322,6 +322,7 @@ INSTALLED_APPS = [
|
||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||
"paperless_text.apps.PaperlessTextConfig",
|
||||
"paperless_mail.apps.PaperlessMailConfig",
|
||||
"paperless_remote.apps.PaperlessRemoteParserConfig",
|
||||
"django.contrib.admin",
|
||||
"rest_framework",
|
||||
"rest_framework.authtoken",
|
||||
@@ -1402,3 +1403,10 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean(
|
||||
"PAPERLESS_WEBHOOKS_ALLOW_INTERNAL_REQUESTS",
|
||||
"true",
|
||||
)
|
||||
|
||||
###############################################################################
|
||||
# Remote Parser #
|
||||
###############################################################################
|
||||
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
|
||||
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
|
||||
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")
|
||||
|
||||
4
src/paperless_remote/__init__.py
Normal file
4
src/paperless_remote/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
# this is here so that django finds the checks.
|
||||
from paperless_remote.checks import check_remote_parser_configured
|
||||
|
||||
__all__ = ["check_remote_parser_configured"]
|
||||
14
src/paperless_remote/apps.py
Normal file
14
src/paperless_remote/apps.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from django.apps import AppConfig
|
||||
|
||||
from paperless_remote.signals import remote_consumer_declaration
|
||||
|
||||
|
||||
class PaperlessRemoteParserConfig(AppConfig):
|
||||
name = "paperless_remote"
|
||||
|
||||
def ready(self):
|
||||
from documents.signals import document_consumer_declaration
|
||||
|
||||
document_consumer_declaration.connect(remote_consumer_declaration)
|
||||
|
||||
AppConfig.ready(self)
|
||||
17
src/paperless_remote/checks.py
Normal file
17
src/paperless_remote/checks.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from django.conf import settings
|
||||
from django.core.checks import Error
|
||||
from django.core.checks import register
|
||||
|
||||
|
||||
@register()
|
||||
def check_remote_parser_configured(app_configs, **kwargs):
|
||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
||||
):
|
||||
return [
|
||||
Error(
|
||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
||||
),
|
||||
]
|
||||
|
||||
return []
|
||||
118
src/paperless_remote/parsers.py
Normal file
118
src/paperless_remote/parsers.py
Normal file
@@ -0,0 +1,118 @@
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
||||
|
||||
|
||||
class RemoteEngineConfig:
|
||||
def __init__(
|
||||
self,
|
||||
engine: str,
|
||||
api_key: str | None = None,
|
||||
endpoint: str | None = None,
|
||||
):
|
||||
self.engine = engine
|
||||
self.api_key = api_key
|
||||
self.endpoint = endpoint
|
||||
|
||||
def engine_is_valid(self):
|
||||
valid = self.engine in ["azureai"] and self.api_key is not None
|
||||
if self.engine == "azureai":
|
||||
valid = valid and self.endpoint is not None
|
||||
return valid
|
||||
|
||||
|
||||
class RemoteDocumentParser(RasterisedDocumentParser):
|
||||
"""
|
||||
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
|
||||
as this is the only service that provides a remote OCR API with text-embedded PDF output.
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.remote"
|
||||
|
||||
def get_settings(self) -> RemoteEngineConfig:
|
||||
"""
|
||||
Returns the configuration for the remote OCR engine, loaded from Django settings.
|
||||
"""
|
||||
return RemoteEngineConfig(
|
||||
engine=settings.REMOTE_OCR_ENGINE,
|
||||
api_key=settings.REMOTE_OCR_API_KEY,
|
||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
||||
)
|
||||
|
||||
def supported_mime_types(self):
|
||||
if self.settings.engine_is_valid():
|
||||
return {
|
||||
"application/pdf": ".pdf",
|
||||
"image/png": ".png",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/tiff": ".tiff",
|
||||
"image/bmp": ".bmp",
|
||||
"image/gif": ".gif",
|
||||
"image/webp": ".webp",
|
||||
}
|
||||
else:
|
||||
return {}
|
||||
|
||||
def azure_ai_vision_parse(
|
||||
self,
|
||||
file: Path,
|
||||
) -> str | None:
|
||||
"""
|
||||
Uses Azure AI Vision to parse the document and return the text content.
|
||||
It requests a searchable PDF output with embedded text.
|
||||
The PDF is saved to the archive_path attribute.
|
||||
Returns the text content extracted from the document.
|
||||
If the parsing fails, it returns None.
|
||||
"""
|
||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
||||
from azure.ai.documentintelligence.models import AnalyzeOutputOption
|
||||
from azure.ai.documentintelligence.models import DocumentContentFormat
|
||||
from azure.core.credentials import AzureKeyCredential
|
||||
|
||||
client = DocumentIntelligenceClient(
|
||||
endpoint=self.settings.endpoint,
|
||||
credential=AzureKeyCredential(self.settings.api_key),
|
||||
)
|
||||
|
||||
try:
|
||||
with file.open("rb") as f:
|
||||
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
|
||||
poller = client.begin_analyze_document(
|
||||
model_id="prebuilt-read",
|
||||
body=analyze_request,
|
||||
output_content_format=DocumentContentFormat.TEXT,
|
||||
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
|
||||
content_type="application/json",
|
||||
)
|
||||
|
||||
poller.wait()
|
||||
result_id = poller.details["operation_id"]
|
||||
result = poller.result()
|
||||
|
||||
# Download the PDF with embedded text
|
||||
self.archive_path = self.tempdir / "archive.pdf"
|
||||
with self.archive_path.open("wb") as f:
|
||||
for chunk in client.get_analyze_result_pdf(
|
||||
model_id="prebuilt-read",
|
||||
result_id=result_id,
|
||||
):
|
||||
f.write(chunk)
|
||||
return result.content
|
||||
except Exception as e:
|
||||
self.log.error(f"Azure AI Vision parsing failed: {e}")
|
||||
finally:
|
||||
client.close()
|
||||
|
||||
return None
|
||||
|
||||
def parse(self, document_path: Path, mime_type, file_name=None):
|
||||
if not self.settings.engine_is_valid():
|
||||
self.log.warning(
|
||||
"No valid remote parser engine is configured, content will be empty.",
|
||||
)
|
||||
self.text = ""
|
||||
elif self.settings.engine == "azureai":
|
||||
self.text = self.azure_ai_vision_parse(document_path)
|
||||
18
src/paperless_remote/signals.py
Normal file
18
src/paperless_remote/signals.py
Normal file
@@ -0,0 +1,18 @@
|
||||
def get_parser(*args, **kwargs):
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
|
||||
return RemoteDocumentParser(*args, **kwargs)
|
||||
|
||||
|
||||
def get_supported_mime_types():
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
|
||||
return RemoteDocumentParser(None).supported_mime_types()
|
||||
|
||||
|
||||
def remote_consumer_declaration(sender, **kwargs):
|
||||
return {
|
||||
"parser": get_parser,
|
||||
"weight": 5,
|
||||
"mime_types": get_supported_mime_types(),
|
||||
}
|
||||
BIN
src/paperless_remote/tests/samples/simple-digital.pdf
Normal file
BIN
src/paperless_remote/tests/samples/simple-digital.pdf
Normal file
Binary file not shown.
24
src/paperless_remote/tests/test_checks.py
Normal file
24
src/paperless_remote/tests/test_checks.py
Normal file
@@ -0,0 +1,24 @@
|
||||
from unittest import TestCase
|
||||
|
||||
from django.test import override_settings
|
||||
|
||||
from paperless_remote import check_remote_parser_configured
|
||||
|
||||
|
||||
class TestChecks(TestCase):
|
||||
@override_settings(REMOTE_OCR_ENGINE=None)
|
||||
def test_no_engine(self):
|
||||
msgs = check_remote_parser_configured(None)
|
||||
self.assertEqual(len(msgs), 0)
|
||||
|
||||
@override_settings(REMOTE_OCR_ENGINE="azureai")
|
||||
@override_settings(REMOTE_OCR_API_KEY="somekey")
|
||||
@override_settings(REMOTE_OCR_ENDPOINT=None)
|
||||
def test_azure_no_endpoint(self):
|
||||
msgs = check_remote_parser_configured(None)
|
||||
self.assertEqual(len(msgs), 1)
|
||||
self.assertTrue(
|
||||
msgs[0].msg.startswith(
|
||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
||||
),
|
||||
)
|
||||
128
src/paperless_remote/tests/test_parser.py
Normal file
128
src/paperless_remote/tests/test_parser.py
Normal file
@@ -0,0 +1,128 @@
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from unittest import mock
|
||||
|
||||
from django.test import TestCase
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.tests.utils import DirectoriesMixin
|
||||
from documents.tests.utils import FileSystemAssertsMixin
|
||||
from paperless_remote.parsers import RemoteDocumentParser
|
||||
from paperless_remote.signals import get_parser
|
||||
|
||||
|
||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
||||
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
|
||||
|
||||
def assertContainsStrings(self, content: str, strings: list[str]):
|
||||
# Asserts that all strings appear in content, in the given order.
|
||||
indices = []
|
||||
for s in strings:
|
||||
if s in content:
|
||||
indices.append(content.index(s))
|
||||
else:
|
||||
self.fail(f"'{s}' is not in '{content}'")
|
||||
self.assertListEqual(indices, sorted(indices))
|
||||
|
||||
@mock.patch("paperless_tesseract.parsers.run_subprocess")
|
||||
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
|
||||
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
|
||||
# Arrange mock Azure client
|
||||
mock_client = mock.Mock()
|
||||
mock_client_cls.return_value = mock_client
|
||||
|
||||
# Simulate poller result and its `.details`
|
||||
mock_poller = mock.Mock()
|
||||
mock_poller.wait.return_value = None
|
||||
mock_poller.details = {"operation_id": "fake-op-id"}
|
||||
mock_client.begin_analyze_document.return_value = mock_poller
|
||||
mock_poller.result.return_value.content = "This is a test document."
|
||||
|
||||
# Return dummy PDF bytes
|
||||
mock_client.get_analyze_result_pdf.return_value = [
|
||||
b"%PDF-",
|
||||
b"1.7 ",
|
||||
b"FAKEPDF",
|
||||
]
|
||||
|
||||
# Simulate pdftotext by writing dummy text to sidecar file
|
||||
def fake_run(cmd, *args, **kwargs):
|
||||
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
|
||||
f.write("This is a test document.")
|
||||
|
||||
mock_subprocess.side_effect = fake_run
|
||||
|
||||
with override_settings(
|
||||
REMOTE_OCR_ENGINE="azureai",
|
||||
REMOTE_OCR_API_KEY="somekey",
|
||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
||||
):
|
||||
parser = get_parser(uuid.uuid4())
|
||||
parser.parse(
|
||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertContainsStrings(
|
||||
parser.text.strip(),
|
||||
["This is a test document."],
|
||||
)
|
||||
|
||||
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
|
||||
def test_get_text_with_azure_error_logged_and_returns_none(self, mock_client_cls):
|
||||
mock_client = mock.Mock()
|
||||
mock_client.begin_analyze_document.side_effect = RuntimeError("fail")
|
||||
mock_client_cls.return_value = mock_client
|
||||
|
||||
with override_settings(
|
||||
REMOTE_OCR_ENGINE="azureai",
|
||||
REMOTE_OCR_API_KEY="somekey",
|
||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
||||
):
|
||||
parser = get_parser(uuid.uuid4())
|
||||
with mock.patch.object(parser.log, "error") as mock_log_error:
|
||||
parser.parse(
|
||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
||||
"application/pdf",
|
||||
)
|
||||
|
||||
self.assertIsNone(parser.text)
|
||||
mock_client.begin_analyze_document.assert_called_once()
|
||||
mock_client.close.assert_called_once()
|
||||
mock_log_error.assert_called_once()
|
||||
self.assertIn(
|
||||
"Azure AI Vision parsing failed",
|
||||
mock_log_error.call_args[0][0],
|
||||
)
|
||||
|
||||
@override_settings(
|
||||
REMOTE_OCR_ENGINE="azureai",
|
||||
REMOTE_OCR_API_KEY="key",
|
||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
||||
)
|
||||
def test_supported_mime_types_valid_config(self):
|
||||
parser = RemoteDocumentParser(uuid.uuid4())
|
||||
expected_types = {
|
||||
"application/pdf": ".pdf",
|
||||
"image/png": ".png",
|
||||
"image/jpeg": ".jpg",
|
||||
"image/tiff": ".tiff",
|
||||
"image/bmp": ".bmp",
|
||||
"image/gif": ".gif",
|
||||
"image/webp": ".webp",
|
||||
}
|
||||
self.assertEqual(parser.supported_mime_types(), expected_types)
|
||||
|
||||
def test_supported_mime_types_invalid_config(self):
|
||||
parser = get_parser(uuid.uuid4())
|
||||
self.assertEqual(parser.supported_mime_types(), {})
|
||||
|
||||
@override_settings(
|
||||
REMOTE_OCR_ENGINE=None,
|
||||
REMOTE_OCR_API_KEY=None,
|
||||
REMOTE_OCR_ENDPOINT=None,
|
||||
)
|
||||
def test_parse_with_invalid_config(self):
|
||||
parser = get_parser(uuid.uuid4())
|
||||
parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
|
||||
self.assertEqual(parser.text, "")
|
||||
39
uv.lock
generated
39
uv.lock
generated
@@ -95,6 +95,34 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/02/ff/1175b0b7371e46244032d43a56862d0af455823b5280a50c63d99cc50f18/automat-25.4.16-py3-none-any.whl", hash = "sha256:04e9bce696a8d5671ee698005af6e5a9fa15354140a87f4870744604dcdd3ba1", size = 42842, upload-time = "2025-04-16T20:12:14.447Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "azure-ai-documentintelligence"
|
||||
version = "1.0.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "azure-core"
|
||||
version = "1.33.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "babel"
|
||||
version = "2.17.0"
|
||||
@@ -1451,6 +1479,15 @@ wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "isodate"
|
||||
version = "0.7.2"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jinja2"
|
||||
version = "3.1.6"
|
||||
@@ -2118,6 +2155,7 @@ name = "paperless-ngx"
|
||||
version = "2.20.3"
|
||||
source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||
@@ -2255,6 +2293,7 @@ typing = [
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
|
||||
{ name = "babel", specifier = ">=2.17" },
|
||||
{ name = "bleach", specifier = "~=6.3.0" },
|
||||
{ name = "celery", extras = ["redis"], specifier = "~=5.5.1" },
|
||||
|
||||
Reference in New Issue
Block a user