Feature: Allow setting backend configuration settings via the UI (#5126)

* Saving some start on this

* At least partially working for the tesseract parser

* Problems with migration testing need to figure out

* Work around that error

* Fixes max m_pixels

* Moving the settings to main paperless application

* Starting some consumer options

* More fixes and work

* Fixes these last tests

* Fix max_length on OcrSettings.mode field

* Fix all fields on Common & Ocr settings serializers

* Umbrella config view

* Revert "Umbrella config view"

This reverts commit fbaf9f4be30f89afeb509099180158a3406416a5.

* Updates to use a single configuration object for all settings

* Squashed commit of the following:

commit 8a0a49dd5766094f60462fbfbe62e9921fbd2373
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 23:02:47 2023 -0800

    Fix formatting

commit 66b2d90c507b8afd9507813ff555e46198ea33b9
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 22:36:35 2023 -0800

    Refactor frontend data models

commit 5723bd8dd823ee855625e250df39393e26709d48
Author: Adam Bogdał <adam@bogdal.pl>
Date:   Wed Dec 20 01:17:43 2023 +0100

    Fix: speed up admin panel for installs with a large number of documents (#5052)

commit 9b08ce176199bf9011a6634bb88f616846150d2b
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:18:51 2023 -0800

    Update PULL_REQUEST_TEMPLATE.md

commit a6248bec2d793b7690feed95fcaf5eb34a75bfb6
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 15:02:05 2023 -0800

    Chore: Update Angular to v17 (#4980)

commit b1f6f52486d5ba5c04af99b41315eb6428fd1fa8
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:53:56 2023 -0800

    Fix: Dont allow null custom_fields property via API (#5063)

commit 638d9970fd468d8c02c91d19bd28f8b0796bdcb1
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 13:43:50 2023 -0800

    Enhancement: symmetric document links (#4907)

commit 5e8de4c1da6eb4eb8f738b20962595c7536b30ec
Author: shamoon <4887959+shamoon@users.noreply.github.com>
Date:   Tue Dec 19 12:45:04 2023 -0800

    Enhancement: shared icon & shared by me filter (#4859)

commit 088bad90306025d3f6b139cbd0ad264a1cbecfe5
Author: Trenton H <797416+stumpylog@users.noreply.github.com>
Date:   Tue Dec 19 12:04:03 2023 -0800

    Bulk updates all the backend libraries (#5061)

* Saving some work on frontend config

* Very basic but dynamically-generated config form

* Saving work on slightly less ugly frontend config

* JSON validation for user_args field

* Fully dynamic config form

* Adds in some additional validators for a nicer error message

* Cleaning up the testing and coverage more

* Reverts unintentional change

* Adds documentation about the settings and the precedence

* Couple more commenting and style fixes

---------

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
Trenton H 2023-12-29 15:42:56 -08:00 committed by GitHub
parent da058b915b
commit 061f33fb05
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
41 changed files with 1570 additions and 119 deletions

View File

@ -3,6 +3,11 @@
Paperless provides a wide range of customizations. Depending on how you
run paperless, these settings have to be defined in different places.
Certain configuration options may be set via the UI. This currently includes
common [OCR](#ocr) related settings. If set, these will take precedence over the
settings defined via environment variables. If not set, the environment setting or
applicable default will be utilized instead.
- If you run paperless on docker, `paperless.conf` is not used.
Rather, configure paperless by copying necessary options to
`docker-compose.env`.

View File

@ -25,6 +25,7 @@ import { ConsumptionTemplatesComponent } from './components/manage/consumption-t
import { MailComponent } from './components/manage/mail/mail.component'
import { UsersAndGroupsComponent } from './components/admin/users-groups/users-groups.component'
import { CustomFieldsComponent } from './components/manage/custom-fields/custom-fields.component'
import { ConfigComponent } from './components/admin/config/config.component'
export const routes: Routes = [
{ path: '', redirectTo: 'dashboard', pathMatch: 'full' },
@ -179,6 +180,17 @@ export const routes: Routes = [
},
},
},
{
path: 'config',
component: ConfigComponent,
canActivate: [PermissionsGuard],
data: {
requiredPermission: {
action: PermissionAction.View,
type: PermissionType.Admin,
},
},
},
{
path: 'tasks',
component: TasksComponent,

View File

@ -108,6 +108,8 @@ import { ProfileEditDialogComponent } from './components/common/profile-edit-dia
import { PdfViewerComponent } from './components/common/pdf-viewer/pdf-viewer.component'
import { DocumentLinkComponent } from './components/common/input/document-link/document-link.component'
import { PreviewPopupComponent } from './components/common/preview-popup/preview-popup.component'
import { ConfigComponent } from './components/admin/config/config.component'
import { SwitchComponent } from './components/common/input/switch/switch.component'
import localeAf from '@angular/common/locales/af'
import localeAr from '@angular/common/locales/ar'
@ -263,6 +265,8 @@ function initializeApp(settings: SettingsService) {
PdfViewerComponent,
DocumentLinkComponent,
PreviewPopupComponent,
ConfigComponent,
SwitchComponent,
],
imports: [
BrowserModule,

View File

@ -0,0 +1,54 @@
<pngx-page-header title="Configuration" i18n-title></pngx-page-header>

<!--
  Dynamically generated configuration form: one tab per entry of optionCategories and,
  inside each tab, one card per option returned by getCategoryOptions(category).
-->
<form [formGroup]="configForm" (ngSubmit)="saveConfig()" class="pb-4">
  <ul ngbNav #nav="ngbNav" class="nav-tabs">
    @for (category of optionCategories; track category) {
      <li [ngbNavItem]="category">
        <a ngbNavLink i18n>{{category}}</a>
        <ng-template ngbNavContent>
          <div class="p-3">
            <div class="row row-cols-1 row-cols-md-2 row-cols-lg-3 g-2">
              @for (option of getCategoryOptions(category); track option.key) {
                <div class="col">
                  <div class="card bg-light">
                    <div class="card-body">
                      <div class="card-title">
                        <h6>
                          {{option.title}}
                          <!-- Link to the documentation anchor built from the option's config_key -->
                          <a class="btn btn-sm btn-link" title="Read the documentation about this setting" i18n-title [href]="getDocsUrl(option.config_key)" target="_blank" referrerpolicy="no-referrer">
                            <svg class="sidebaricon" fill="currentColor">
                              <use xlink:href="assets/bootstrap-icons.svg#info-circle"/>
                            </svg>
                          </a>
                        </h6>
                      </div>
                      <div class="mb-n3">
                        <!-- The input widget is selected by the option's declared type -->
                        @switch (option.type) {
                          @case (ConfigOptionType.Select) { <pngx-input-select [formControlName]="option.key" [error]="errors[option.key]" [items]="option.choices" [allowNull]="true"></pngx-input-select> }
                          @case (ConfigOptionType.Number) { <pngx-input-number [formControlName]="option.key" [error]="errors[option.key]" [showAdd]="false"></pngx-input-number> }
                          @case (ConfigOptionType.Boolean) { <pngx-input-switch [formControlName]="option.key" [error]="errors[option.key]" title="Enable" i18n-title></pngx-input-switch> }
                          @case (ConfigOptionType.String) { <pngx-input-text [formControlName]="option.key" [error]="errors[option.key]"></pngx-input-text> }
                          @case (ConfigOptionType.JSON) { <pngx-input-text [formControlName]="option.key" [error]="errors[option.key]"></pngx-input-text> }
                        }
                      </div>
                    </div>
                  </div>
                </div>
              }
            </div>
          </div>
        </ng-template>
      </li>
    }
  </ul>
  <div [ngbNavOutlet]="nav" class="border-start border-end border-bottom p-3 mb-3 shadow-sm"></div>
  <!--
    isDirty$ only exists after the configuration has loaded, and the async pipe yields null
    until the observable emits. Treat "not known to be dirty" (null or false) as clean so the
    buttons stay disabled when loading failed or the dirty state is not yet known.
  -->
  <div class="btn-toolbar" role="toolbar">
    <div class="btn-group me-2">
      <button type="button" (click)="discardChanges()" class="btn btn-secondary" [disabled]="loading || !(isDirty$ | async)" i18n>Discard</button>
    </div>
    <div class="btn-group">
      <button type="submit" class="btn btn-primary" [disabled]="loading || !configForm.valid || !(isDirty$ | async)" i18n>Save</button>
    </div>
  </div>
</form>

View File

@ -0,0 +1,103 @@
import { ComponentFixture, TestBed } from '@angular/core/testing'
import { ConfigComponent } from './config.component'
import { ConfigService } from 'src/app/services/config.service'
import { ToastService } from 'src/app/services/toast.service'
import { of, throwError } from 'rxjs'
import { OutputTypeConfig } from 'src/app/data/paperless-config'
import { HttpClientTestingModule } from '@angular/common/http/testing'
import { BrowserModule } from '@angular/platform-browser'
import { NgbModule } from '@ng-bootstrap/ng-bootstrap'
import { NgSelectModule } from '@ng-select/ng-select'
import { TextComponent } from '../../common/input/text/text.component'
import { NumberComponent } from '../../common/input/number/number.component'
import { SwitchComponent } from '../../common/input/switch/switch.component'
import { FormsModule, ReactiveFormsModule } from '@angular/forms'
import { PageHeaderComponent } from '../../common/page-header/page-header.component'
import { SelectComponent } from '../../common/input/select/select.component'
// Unit tests for ConfigComponent: loading, saving, discarding and JSON validation.
describe('ConfigComponent', () => {
let component: ConfigComponent
let fixture: ComponentFixture<ConfigComponent>
let configService: ConfigService
let toastService: ToastService
beforeEach(async () => {
// Declare the component together with every input component its template renders
await TestBed.configureTestingModule({
declarations: [
ConfigComponent,
TextComponent,
SelectComponent,
NumberComponent,
SwitchComponent,
PageHeaderComponent,
],
imports: [
HttpClientTestingModule,
BrowserModule,
NgbModule,
NgSelectModule,
FormsModule,
ReactiveFormsModule,
],
}).compileComponents()
configService = TestBed.inject(ConfigService)
toastService = TestBed.inject(ToastService)
fixture = TestBed.createComponent(ConfigComponent)
component = fixture.componentInstance
// First change detection runs ngOnInit, which also installs the JSON validators
fixture.detectChanges()
})
it('should load config on init, show error if necessary', () => {
const getSpy = jest.spyOn(configService, 'getConfig')
const errorSpy = jest.spyOn(toastService, 'showError')
// First call fails: the component is expected to surface an error toast
getSpy.mockReturnValueOnce(
throwError(() => new Error('Error getting config'))
)
component.ngOnInit()
expect(getSpy).toHaveBeenCalled()
expect(errorSpy).toHaveBeenCalled()
// Second call succeeds: the returned config becomes the initial config
getSpy.mockReturnValueOnce(
of({ output_type: OutputTypeConfig.PDF_A } as any)
)
component.ngOnInit()
expect(component.initialConfig).toEqual({
output_type: OutputTypeConfig.PDF_A,
})
})
it('should save config, show error if necessary', () => {
const saveSpy = jest.spyOn(configService, 'saveConfig')
const errorSpy = jest.spyOn(toastService, 'showError')
// Failing save shows an error toast
saveSpy.mockReturnValueOnce(
throwError(() => new Error('Error saving config'))
)
component.saveConfig()
expect(saveSpy).toHaveBeenCalled()
expect(errorSpy).toHaveBeenCalled()
// Successful save replaces the initial config with the server response
saveSpy.mockReturnValueOnce(
of({ output_type: OutputTypeConfig.PDF_A } as any)
)
component.saveConfig()
expect(component.initialConfig).toEqual({
output_type: OutputTypeConfig.PDF_A,
})
})
it('should support discard changes', () => {
// Discarding resets the form back to the values held in initialConfig
component.initialConfig = { output_type: OutputTypeConfig.PDF_A2 } as any
component.configForm.patchValue({ output_type: OutputTypeConfig.PDF_A })
component.discardChanges()
expect(component.configForm.get('output_type').value).toEqual(
OutputTypeConfig.PDF_A2
)
})
it('should support JSON validation for e.g. user_args', () => {
// Unparseable JSON populates the per-field error message ...
component.configForm.patchValue({ user_args: '{ foo bar }' })
expect(component.errors).toEqual({ user_args: 'Invalid JSON' })
// ... and valid JSON clears it again
component.configForm.patchValue({ user_args: '{ "foo": "bar" }' })
expect(component.errors).toEqual({ user_args: null })
})
})

View File

@ -0,0 +1,163 @@
import { Component, OnDestroy, OnInit } from '@angular/core'
import { AbstractControl, FormControl, FormGroup } from '@angular/forms'
import {
BehaviorSubject,
Observable,
Subject,
Subscription,
first,
takeUntil,
} from 'rxjs'
import {
PaperlessConfigOptions,
ConfigCategory,
ConfigOption,
ConfigOptionType,
PaperlessConfig,
} from 'src/app/data/paperless-config'
import { ConfigService } from 'src/app/services/config.service'
import { ToastService } from 'src/app/services/toast.service'
import { ComponentWithPermissions } from '../../with-permissions/with-permissions.component'
import { DirtyComponent, dirtyCheck } from '@ngneat/dirty-check-forms'
/**
 * Admin page for editing the application configuration.
 *
 * The form is built dynamically from `PaperlessConfigOptions`: one form control per
 * option, plus an `id` control for the configuration object's id. Values are loaded from
 * and saved through `ConfigService`.
 */
@Component({
  selector: 'pngx-config',
  templateUrl: './config.component.html',
  styleUrl: './config.component.scss',
})
export class ConfigComponent
  extends ComponentWithPermissions
  implements OnInit, OnDestroy, DirtyComponent
{
  // exposed so the template can switch on the option type
  public readonly ConfigOptionType = ConfigOptionType

  // generated dynamically
  public configForm = new FormGroup({})

  // per-control error messages, keyed by option key, bound to the inputs' [error]
  public errors = {}

  /** Category names, each rendered as one tab. */
  get optionCategories(): string[] {
    return Object.values(ConfigCategory)
  }

  /** Returns the options that belong to the given category. */
  getCategoryOptions(category: string): ConfigOption[] {
    return PaperlessConfigOptions.filter((o) => o.category === category)
  }

  public loading: boolean = false

  // last configuration received from the backend, used by discardChanges()
  initialConfig: PaperlessConfig

  // reference state for the dirty check
  store: BehaviorSubject<any>
  storeSub: Subscription
  isDirty$: Observable<boolean>

  // emits on destroy to tear down every subscription piped through takeUntil
  private unsubscribeNotifier: Subject<any> = new Subject()

  constructor(
    private configService: ConfigService,
    private toastService: ToastService
  ) {
    super()
    this.configForm.addControl('id', new FormControl())
    PaperlessConfigOptions.forEach((option) => {
      this.configForm.addControl(option.key, new FormControl())
    })
  }

  /** Loads the configuration and installs validation for JSON-typed options. */
  ngOnInit(): void {
    this.loading = true
    this.configService
      .getConfig()
      .pipe(takeUntil(this.unsubscribeNotifier))
      .subscribe({
        next: (config) => {
          this.loading = false
          this.initialize(config)
        },
        error: (e) => {
          this.loading = false
          this.toastService.showError($localize`Error retrieving config`, e)
        },
      })

    // validate JSON inputs
    PaperlessConfigOptions.filter(
      (o) => o.type === ConfigOptionType.JSON
    ).forEach((option) => {
      const control = this.configForm.get(option.key)

      control.addValidators((c: AbstractControl) => {
        // empty values are allowed
        if (!c.value || c.value.toString().length === 0) return null
        try {
          JSON.parse(c.value)
        } catch (e) {
          // Validators must return an error map (or null), not an array, and the
          // error must not be tied to one specific option key
          return { invalidJson: e }
        }
        return null
      })

      // mirror the control status into the message shown next to the input;
      // torn down together with the component's other subscriptions
      control.statusChanges
        .pipe(takeUntil(this.unsubscribeNotifier))
        .subscribe((status) => {
          this.errors[option.key] =
            status === 'INVALID' ? $localize`Invalid JSON` : null
        })

      control.updateValueAndValidity()
    })
  }

  ngOnDestroy(): void {
    this.unsubscribeNotifier.next(true)
    this.unsubscribeNotifier.complete()
  }

  /**
   * Applies a configuration received from the backend to the form and remembers it
   * as the new baseline. The dirty-check store is created on first use only.
   */
  private initialize(config: PaperlessConfig) {
    if (!this.store) {
      this.store = new BehaviorSubject(config)

      this.store
        .asObservable()
        .pipe(takeUntil(this.unsubscribeNotifier))
        .subscribe((state) => {
          this.configForm.patchValue(state, { emitEvent: false })
        })

      this.isDirty$ = dirtyCheck(this.configForm, this.store.asObservable())
    }
    this.configForm.patchValue(config)

    this.initialConfig = config
  }

  /** Builds the documentation URL for a setting from its configuration key. */
  getDocsUrl(key: string) {
    return `https://docs.paperless-ngx.com/configuration/#${key}`
  }

  /** Sends the current form value to the backend and reports the outcome as a toast. */
  public saveConfig() {
    this.loading = true
    this.configService
      .saveConfig(this.configForm.value as PaperlessConfig)
      .pipe(takeUntil(this.unsubscribeNotifier), first())
      .subscribe({
        next: (config) => {
          this.loading = false
          this.initialize(config)
          // publish the saved state so the dirty check compares against it
          this.store.next(config)
          this.toastService.showInfo($localize`Configuration updated`)
        },
        error: (e) => {
          this.loading = false
          this.toastService.showError(
            $localize`An error occurred updating configuration`,
            e
          )
        },
      })
  }

  /** Resets the form to the last configuration received from the backend. */
  public discardChanges() {
    this.configForm.reset(this.initialConfig)
  }
}

View File

@ -271,6 +271,15 @@
</svg><span>&nbsp;<ng-container i18n>Settings</ng-container></span>
</a>
</li>
<li class="nav-item" *pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.Admin }">
<a class="nav-link" routerLink="config" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="Configuration" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<svg class="sidebaricon" fill="currentColor">
<use xlink:href="assets/bootstrap-icons.svg#sliders2-vertical" />
</svg><span>&nbsp;<ng-container i18n>Configuration</ng-container></span>
</a>
</li>
<li class="nav-item" *pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.User }">
<a class="nav-link" routerLink="usersgroups" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="Users & Groups" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"

View File

@ -1,7 +1,9 @@
<div class="mb-3" [class.pb-3]="error">
<div class="row">
<div class="d-flex align-items-center position-relative hidden-button-container" [class.col-md-3]="horizontal">
<label class="form-label" [class.mb-md-0]="horizontal" [for]="inputId">{{title}}</label>
@if (title) {
<label class="form-label" [class.mb-md-0]="horizontal" [for]="inputId">{{title}}</label>
}
@if (removable) {
<button type="button" class="btn btn-sm btn-danger position-absolute left-0" (click)="removed.emit(this)">
<svg class="sidebaricon" fill="currentColor">

View File

@ -0,0 +1,27 @@
<!-- Switch (checkbox) input; the label position depends on the `horizontal` flag. -->
<div class="mb-3">
<div class="row">
<!-- Horizontal layout: label (and optional remove button) in a separate column -->
@if (horizontal) {
<div class="d-flex align-items-center position-relative hidden-button-container col-md-3">
<label class="form-label" [class.mb-md-0]="horizontal" [for]="inputId">{{title}}</label>
@if (removable) {
<button type="button" class="btn btn-sm btn-danger position-absolute left-0" (click)="removed.emit(this)">
<svg class="sidebaricon" fill="currentColor">
<use xlink:href="assets/bootstrap-icons.svg#x"/>
</svg>&nbsp;<ng-container i18n>Remove</ng-container>
</button>
}
</div>
}
<div [ngClass]="{'col-md-9': horizontal, 'align-items-center': horizontal, 'd-flex': horizontal}">
<div class="form-check form-switch">
<input #inputField type="checkbox" class="form-check-input" [id]="inputId" [(ngModel)]="value" (change)="onChange(value)" (blur)="onTouched()" [disabled]="disabled">
<!-- Vertical layout: label is rendered next to the switch itself -->
@if (!horizontal) {
<label class="form-check-label" [for]="inputId">{{title}}</label>
}
@if (hint) {
<div class="form-text text-muted">{{hint}}</div>
}
</div>
</div>
</div>
</div>

View File

@ -0,0 +1,39 @@
import { ComponentFixture, TestBed } from '@angular/core/testing'
import { SwitchComponent } from './switch.component'
import {
FormsModule,
NG_VALUE_ACCESSOR,
ReactiveFormsModule,
} from '@angular/forms'
// Unit tests for SwitchComponent, the checkbox-based boolean input.
describe('SwitchComponent', () => {
  let component: SwitchComponent
  let fixture: ComponentFixture<SwitchComponent>
  let input: HTMLInputElement

  beforeEach(async () => {
    // compileComponents() is asynchronous; await it so the component is fully
    // compiled before the fixture is created
    await TestBed.configureTestingModule({
      declarations: [SwitchComponent],
      providers: [],
      imports: [FormsModule, ReactiveFormsModule],
    }).compileComponents()

    fixture = TestBed.createComponent(SwitchComponent)
    fixture.debugElement.injector.get(NG_VALUE_ACCESSOR)
    component = fixture.componentInstance
    fixture.detectChanges()
    input = component.inputField.nativeElement
  })

  it('should support use of checkbox', () => {
    // checking the box propagates a truthy value to the component
    input.checked = true
    input.dispatchEvent(new Event('change'))
    fixture.detectChanges()
    expect(component.value).toBeTruthy()

    // unchecking propagates a falsy value
    input.checked = false
    input.dispatchEvent(new Event('change'))
    fixture.detectChanges()
    expect(component.value).toBeFalsy()
  })
})

View File

@ -0,0 +1,21 @@
import { Component, forwardRef } from '@angular/core'
import { NG_VALUE_ACCESSOR } from '@angular/forms'
import { AbstractInputComponent } from '../abstract-input'
/**
 * Boolean form input rendered as a switch (`pngx-input-switch`).
 *
 * Registers itself as an NG_VALUE_ACCESSOR so it can be bound with form directives;
 * all input behavior is inherited from AbstractInputComponent<boolean>.
 */
@Component({
providers: [
{
provide: NG_VALUE_ACCESSOR,
// forwardRef is needed because the class is referenced before it is defined
useExisting: forwardRef(() => SwitchComponent),
multi: true,
},
],
selector: 'pngx-input-switch',
templateUrl: './switch.component.html',
styleUrls: ['./switch.component.scss'],
})
export class SwitchComponent extends AbstractInputComponent<boolean> {
constructor() {
super()
}
}

View File

@ -1,7 +1,9 @@
<div class="mb-3" [class.pb-3]="error">
<div class="row">
<div class="d-flex align-items-center position-relative hidden-button-container" [class.col-md-3]="horizontal">
<label class="form-label" [class.mb-md-0]="horizontal" [for]="inputId">{{title}}</label>
@if (title) {
<label class="form-label" [class.mb-md-0]="horizontal" [for]="inputId">{{title}}</label>
}
@if (removable) {
<button type="button" class="btn btn-sm btn-danger position-absolute left-0" (click)="removed.emit(this)">
<svg class="sidebaricon" fill="currentColor">

View File

@ -0,0 +1,183 @@
import { ObjectWithId } from './object-with-id'
// see /src/paperless/models.py
// Allowed values for the `output_type` option
export enum OutputTypeConfig {
PDF = 'pdf',
PDF_A = 'pdfa',
PDF_A1 = 'pdfa-1',
PDF_A2 = 'pdfa-2',
PDF_A3 = 'pdfa-3',
}
// Allowed values for the `mode` option
export enum ModeConfig {
SKIP = 'skip',
REDO = 'redo',
FORCE = 'force',
SKIP_NO_ARCHIVE = 'skip_noarchive',
}
// Allowed values for the `skip_archive_file` option
export enum ArchiveFileConfig {
NEVER = 'never',
WITH_TEXT = 'with_text',
ALWAYS = 'always',
}
// Allowed values for the `unpaper_clean` option
export enum CleanConfig {
CLEAN = 'clean',
FINAL = 'clean-final',
NONE = 'none',
}
// Allowed values for the `color_conversion_strategy` option
export enum ColorConvertConfig {
UNCHANGED = 'LeaveColorUnchanged',
RGB = 'RGB',
INDEPENDENT = 'UseDeviceIndependentColor',
GRAY = 'Gray',
CMYK = 'CMYK',
}
// Kind of value an option holds; stored in `ConfigOption.type`
export enum ConfigOptionType {
String = 'string',
Number = 'number',
Select = 'select',
Boolean = 'boolean',
JSON = 'json',
}
// Localized category names used to group options
export const ConfigCategory = {
OCR: $localize`OCR Settings`,
}
// Describes a single configurable option
export interface ConfigOption {
// field name of the option on PaperlessConfig
key: string
// localized, human-readable title
title: string
// kind of value the option holds
type: ConfigOptionType
// selectable values, set for Select-type options
choices?: Array<{ id: string; name: string }>
// name of the corresponding PAPERLESS_* configuration setting
config_key?: string
// one of the ConfigCategory values
category: string
}
/**
 * Turns an enum-like object into a list of select items.
 *
 * Each value of the object yields one item that uses the value as both
 * its `id` and its `name`.
 */
function mapToItems(enumObj: Object): Array<{ id: string; name: string }> {
  return Object.values(enumObj).map((value) => ({ id: value, name: value }))
}
/**
 * All options that can be configured, in display order.
 *
 * `key` is the field name on `PaperlessConfig`; `config_key` is the name of the
 * corresponding `PAPERLESS_*` configuration setting.
 */
export const PaperlessConfigOptions: ConfigOption[] = [
  {
    key: 'output_type',
    title: $localize`Output Type`,
    type: ConfigOptionType.Select,
    choices: mapToItems(OutputTypeConfig),
    config_key: 'PAPERLESS_OCR_OUTPUT_TYPE',
    category: ConfigCategory.OCR,
  },
  {
    key: 'language',
    title: $localize`Language`,
    type: ConfigOptionType.String,
    config_key: 'PAPERLESS_OCR_LANGUAGE',
    category: ConfigCategory.OCR,
  },
  {
    key: 'pages',
    title: $localize`Pages`,
    type: ConfigOptionType.Number,
    config_key: 'PAPERLESS_OCR_PAGES',
    category: ConfigCategory.OCR,
  },
  {
    key: 'mode',
    title: $localize`Mode`,
    type: ConfigOptionType.Select,
    choices: mapToItems(ModeConfig),
    config_key: 'PAPERLESS_OCR_MODE',
    category: ConfigCategory.OCR,
  },
  {
    key: 'skip_archive_file',
    title: $localize`Skip Archive File`,
    type: ConfigOptionType.Select,
    choices: mapToItems(ArchiveFileConfig),
    config_key: 'PAPERLESS_OCR_SKIP_ARCHIVE_FILE',
    category: ConfigCategory.OCR,
  },
  {
    key: 'image_dpi',
    title: $localize`Image DPI`,
    type: ConfigOptionType.Number,
    config_key: 'PAPERLESS_OCR_IMAGE_DPI',
    category: ConfigCategory.OCR,
  },
  {
    key: 'unpaper_clean',
    title: $localize`Clean`,
    type: ConfigOptionType.Select,
    choices: mapToItems(CleanConfig),
    config_key: 'PAPERLESS_OCR_CLEAN',
    category: ConfigCategory.OCR,
  },
  {
    key: 'deskew',
    title: $localize`Deskew`,
    type: ConfigOptionType.Boolean,
    config_key: 'PAPERLESS_OCR_DESKEW',
    category: ConfigCategory.OCR,
  },
  {
    key: 'rotate_pages',
    title: $localize`Rotate Pages`,
    type: ConfigOptionType.Boolean,
    config_key: 'PAPERLESS_OCR_ROTATE_PAGES',
    category: ConfigCategory.OCR,
  },
  {
    key: 'rotate_pages_threshold',
    title: $localize`Rotate Pages Threshold`,
    type: ConfigOptionType.Number,
    config_key: 'PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD',
    category: ConfigCategory.OCR,
  },
  {
    key: 'max_image_pixels',
    title: $localize`Max Image Pixels`,
    type: ConfigOptionType.Number,
    // was PAPERLESS_OCR_IMAGE_DPI (copied from `image_dpi`), which pointed at the wrong setting
    config_key: 'PAPERLESS_OCR_MAX_IMAGE_PIXELS',
    category: ConfigCategory.OCR,
  },
  {
    key: 'color_conversion_strategy',
    title: $localize`Color Conversion Strategy`,
    type: ConfigOptionType.Select,
    choices: mapToItems(ColorConvertConfig),
    config_key: 'PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY',
    category: ConfigCategory.OCR,
  },
  {
    key: 'user_args',
    title: $localize`OCR Arguments`,
    type: ConfigOptionType.JSON,
    config_key: 'PAPERLESS_OCR_USER_ARGS',
    category: ConfigCategory.OCR,
  },
]
// Shape of the configuration object; field names match the `key` of each
// entry in PaperlessConfigOptions
export interface PaperlessConfig extends ObjectWithId {
output_type: OutputTypeConfig
pages: number
language: string
mode: ModeConfig
skip_archive_file: ArchiveFileConfig
image_dpi: number
unpaper_clean: CleanConfig
deskew: boolean
rotate_pages: boolean
rotate_pages_threshold: number
max_image_pixels: number
color_conversion_strategy: ColorConvertConfig
// NOTE(review): typed as object, but the option is declared with the JSON
// option type - confirm whether this holds parsed JSON or raw text
user_args: object
}

View File

@ -0,0 +1,42 @@
import { TestBed } from '@angular/core/testing'
import { ConfigService } from './config.service'
import {
HttpClientTestingModule,
HttpTestingController,
} from '@angular/common/http/testing'
import { environment } from 'src/environments/environment'
import { OutputTypeConfig, PaperlessConfig } from '../data/paperless-config'
// Unit tests for ConfigService: verifies the URLs and HTTP method it uses.
describe('ConfigService', () => {
let service: ConfigService
let httpTestingController: HttpTestingController
beforeEach(() => {
TestBed.configureTestingModule({
imports: [HttpClientTestingModule],
})
service = TestBed.inject(ConfigService)
httpTestingController = TestBed.inject(HttpTestingController)
})
it('should call correct API endpoint on get config', () => {
service.getConfig().subscribe()
// the service expects a list response, so flush an array with one config
httpTestingController
.expectOne(`${environment.apiBaseUrl}config/`)
.flush([{}])
})
it('should call correct API endpoint on set config', () => {
service
.saveConfig({
id: 1,
output_type: OutputTypeConfig.PDF_A,
} as PaperlessConfig)
.subscribe()
// the config id is part of the URL and the update is sent as a PATCH
const req = httpTestingController.expectOne(
`${environment.apiBaseUrl}config/1/`
)
expect(req.request.method).toEqual('PATCH')
})
})

View File

@ -0,0 +1,27 @@
import { HttpClient } from '@angular/common/http'
import { Injectable } from '@angular/core'
import { Observable, first, map } from 'rxjs'
import { environment } from 'src/environments/environment'
import { PaperlessConfig } from '../data/paperless-config'
/**
 * Reads and updates the application configuration through the `config/` API endpoint.
 */
@Injectable({
  providedIn: 'root',
})
export class ConfigService {
  protected baseUrl: string = environment.apiBaseUrl + 'config/'

  constructor(protected http: HttpClient) {}

  /**
   * Fetches the configuration. The endpoint answers with a list, of which
   * the first entry is returned.
   */
  getConfig(): Observable<PaperlessConfig> {
    const response$ = this.http.get<[PaperlessConfig]>(this.baseUrl)
    return response$.pipe(
      first(),
      map((entries) => entries[0])
    )
  }

  /**
   * Sends the given configuration as a PATCH to the URL built from its id
   * and returns the configuration from the response.
   */
  saveConfig(config: PaperlessConfig): Observable<PaperlessConfig> {
    const url = `${this.baseUrl}${config.id}/`
    const response$ = this.http.patch<PaperlessConfig>(url, config)
    return response$.pipe(first())
  }
}

View File

@ -52,7 +52,7 @@ def load_classifier() -> Optional["DocumentClassifier"]:
except OSError:
logger.exception("IO error while loading document classification model")
classifier = None
except Exception: # pragma: nocover
except Exception: # pragma: no cover
logger.exception("Unknown error while loading document classification model")
classifier = None
@ -318,7 +318,7 @@ class DocumentClassifier:
return True
def preprocess_content(self, content: str) -> str: # pragma: nocover
def preprocess_content(self, content: str) -> str: # pragma: no cover
"""
Process to contents of a document, distilling it down into
words which are meaningful to the content

View File

@ -420,7 +420,7 @@ class Consumer(LoggingMixin):
document_parser: DocumentParser = parser_class(
self.logging_group,
progress_callback,
progress_callback=progress_callback,
)
self.log.debug(f"Parser: {type(document_parser).__name__}")

View File

@ -26,7 +26,7 @@ from documents.tasks import consume_file
try:
from inotifyrecursive import INotify
from inotifyrecursive import flags
except ImportError: # pragma: nocover
except ImportError: # pragma: no cover
INotify = flags = None
logger = logging.getLogger("paperless.management.consumer")

View File

@ -41,6 +41,7 @@ from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.db import GnuPG
from paperless.models import ApplicationConfiguration
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
@ -291,6 +292,10 @@ class Command(BaseCommand):
serializers.serialize("json", CustomField.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", ApplicationConfiguration.objects.all()),
)
# These are treated specially and included in the per-document manifest
# if that setting is enabled. Otherwise, they are just exported to the bulk
# manifest

View File

@ -5,7 +5,7 @@ from django.core.management.commands.loaddata import Command as LoadDataCommand
# This class is used to migrate data between databases
# That's difficult to test
class Command(LoadDataCommand): # pragma: nocover
class Command(LoadDataCommand): # pragma: no cover
"""
Allow the loading of data from standard in. Sourced originally from:
https://gist.github.com/bmispelon/ad5a2c333443b3a1d051 (MIT licensed)

View File

@ -125,8 +125,10 @@ def get_parser_class_for_mime_type(mime_type: str) -> Optional[type["DocumentPar
if not options:
return None
best_parser = sorted(options, key=lambda _: _["weight"], reverse=True)[0]
# Return the parser with the highest weight.
return sorted(options, key=lambda _: _["weight"], reverse=True)[0]["parser"]
return best_parser["parser"]
def run_convert(
@ -318,6 +320,7 @@ class DocumentParser(LoggingMixin):
def __init__(self, logging_group, progress_callback=None):
super().__init__()
self.logging_group = logging_group
self.settings = self.get_settings()
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
self.tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
@ -330,6 +333,12 @@ class DocumentParser(LoggingMixin):
if self.progress_callback:
self.progress_callback(current_progress, max_progress)
def get_settings(self): # pragma: no cover
"""
Return the settings object for this parser. Every concrete parser
must override this method.

It is called from __init__ and its result is stored as self.settings.
The base implementation always raises NotImplementedError.
"""
raise NotImplementedError
def read_file_handle_unicode_errors(self, filepath: Path) -> str:
"""
Helper utility for reading from a file, and handling a problem with its

View File

@ -172,7 +172,15 @@ class TestFieldPermutations(TestCase):
self.assertEqual(info.title, "anotherall")
class DummyParser(DocumentParser):
class _BaseTestParser(DocumentParser):
def get_settings(self):
"""
This parser does not implement additional settings yet
"""
return None
class DummyParser(_BaseTestParser):
def __init__(self, logging_group, scratch_dir, archive_path):
super().__init__(logging_group, None)
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
@ -185,7 +193,7 @@ class DummyParser(DocumentParser):
self.text = "The Text"
class CopyParser(DocumentParser):
class CopyParser(_BaseTestParser):
def get_thumbnail(self, document_path, mime_type, file_name=None):
return self.fake_thumb
@ -199,7 +207,7 @@ class CopyParser(DocumentParser):
shutil.copy(document_path, self.archive_path)
class FaultyParser(DocumentParser):
class FaultyParser(_BaseTestParser):
def __init__(self, logging_group, scratch_dir):
super().__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)
@ -211,7 +219,7 @@ class FaultyParser(DocumentParser):
raise ParseError("Does not compute.")
class FaultyGenericExceptionParser(DocumentParser):
class FaultyGenericExceptionParser(_BaseTestParser):
def __init__(self, logging_group, scratch_dir):
super().__init__(logging_group)
_, self.fake_thumb = tempfile.mkstemp(suffix=".webp", dir=scratch_dir)

View File

@ -168,7 +168,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
manifest = self._do_export(use_filename_format=use_filename_format)
self.assertEqual(len(manifest), 172)
self.assertEqual(len(manifest), 178)
# dont include consumer or AnonymousUser users
self.assertEqual(
@ -262,7 +262,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(Document.objects.get(id=self.d4.id).title, "wow_dec")
self.assertEqual(GroupObjectPermission.objects.count(), 1)
self.assertEqual(UserObjectPermission.objects.count(), 1)
self.assertEqual(Permission.objects.count(), 124)
self.assertEqual(Permission.objects.count(), 128)
messages = check_sanity()
# everything is alright after the test
self.assertEqual(len(messages), 0)
@ -694,15 +694,15 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
os.path.join(self.dirs.media_dir, "documents"),
)
self.assertEqual(ContentType.objects.count(), 31)
self.assertEqual(Permission.objects.count(), 124)
self.assertEqual(ContentType.objects.count(), 32)
self.assertEqual(Permission.objects.count(), 128)
manifest = self._do_export()
with paperless_environment():
self.assertEqual(
len(list(filter(lambda e: e["model"] == "auth.permission", manifest))),
124,
128,
)
# add 1 more to db to show objects are not re-created by import
Permission.objects.create(
@ -710,7 +710,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
codename="test_perm",
content_type_id=1,
)
self.assertEqual(Permission.objects.count(), 125)
self.assertEqual(Permission.objects.count(), 129)
# will cause an import error
self.user.delete()
@ -719,5 +719,5 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
with self.assertRaises(IntegrityError):
call_command("document_importer", "--no-progress-bar", self.target)
self.assertEqual(ContentType.objects.count(), 31)
self.assertEqual(Permission.objects.count(), 125)
self.assertEqual(ContentType.objects.count(), 32)
self.assertEqual(Permission.objects.count(), 129)

88
src/paperless/config.py Normal file
View File

@ -0,0 +1,88 @@
import dataclasses
import json
from typing import Optional
from django.conf import settings
from paperless.models import ApplicationConfiguration
@dataclasses.dataclass
class OutputTypeConfig:
    """
    Base configuration holding the PDF output type, which almost every
    parser needs.

    The field is not an __init__ argument; it is filled in by __post_init__
    from the stored configuration row, using the Django setting
    OCR_OUTPUT_TYPE when the stored value is empty.
    """

    output_type: str = dataclasses.field(init=False)

    @staticmethod
    def _get_config_instance() -> ApplicationConfiguration:
        """
        Return the first ApplicationConfiguration row, creating one with
        default values when the table is empty.
        """
        stored = ApplicationConfiguration.objects.all().first()
        if stored is not None:
            return stored

        # Workaround for a test where the migration hasn't run to create
        # the single model
        ApplicationConfiguration.objects.create()
        return ApplicationConfiguration.objects.all().first()

    def __post_init__(self) -> None:
        stored = self._get_config_instance()

        db_output_type = stored.output_type
        self.output_type = (
            db_output_type if db_output_type else settings.OCR_OUTPUT_TYPE
        )
@dataclasses.dataclass
class OcrConfig(OutputTypeConfig):
    """
    Specific settings for the Tesseract based parser. Options generally
    correspond almost directly to the OCRMyPDF options

    No attribute is an __init__ argument. Each one is resolved in
    __post_init__: the value stored on the ApplicationConfiguration row
    takes precedence and the matching Django setting is the fallback used
    when nothing is stored.
    """

    pages: Optional[int] = dataclasses.field(init=False)
    language: str = dataclasses.field(init=False)
    mode: str = dataclasses.field(init=False)
    skip_archive_file: str = dataclasses.field(init=False)
    image_dpi: Optional[int] = dataclasses.field(init=False)
    clean: str = dataclasses.field(init=False)
    deskew: bool = dataclasses.field(init=False)
    rotate: bool = dataclasses.field(init=False)
    rotate_threshold: float = dataclasses.field(init=False)
    max_image_pixel: Optional[float] = dataclasses.field(init=False)
    color_conversion_strategy: str = dataclasses.field(init=False)
    user_args: Optional[dict[str, str]] = dataclasses.field(init=False)

    def __post_init__(self) -> None:
        """
        Populate every OCR option from the database row, falling back to
        the Django settings for values which are not stored.
        """
        super().__post_init__()

        app_config = self._get_config_instance()

        # For these options an empty/zero stored value carries no meaning,
        # so a plain truthiness fallback is sufficient
        self.pages = app_config.pages or settings.OCR_PAGES
        self.language = app_config.language or settings.OCR_LANGUAGE
        self.mode = app_config.mode or settings.OCR_MODE
        self.skip_archive_file = (
            app_config.skip_archive_file or settings.OCR_SKIP_ARCHIVE_FILE
        )
        self.image_dpi = app_config.image_dpi or settings.OCR_IMAGE_DPI
        self.clean = app_config.unpaper_clean or settings.OCR_CLEAN

        # False and 0.0 are legitimate stored values for the next three
        # options. They must be compared against None: with `or`, a stored
        # False/0.0 would be thrown away and the Django setting would win,
        # making it impossible to disable deskew or rotation from the
        # database while the setting is enabled.
        self.deskew = (
            settings.OCR_DESKEW if app_config.deskew is None else app_config.deskew
        )
        self.rotate = (
            settings.OCR_ROTATE_PAGES
            if app_config.rotate_pages is None
            else app_config.rotate_pages
        )
        self.rotate_threshold = (
            settings.OCR_ROTATE_PAGES_THRESHOLD
            if app_config.rotate_pages_threshold is None
            else app_config.rotate_pages_threshold
        )

        self.max_image_pixel = (
            app_config.max_image_pixels or settings.OCR_MAX_IMAGE_PIXELS
        )
        self.color_conversion_strategy = (
            app_config.color_conversion_strategy
            or settings.OCR_COLOR_CONVERSION_STRATEGY
        )

        # The database stores the arguments as JSON already decoded into a
        # mapping; the Django setting holds a JSON string which still needs
        # to be parsed. An unparsable setting results in no extra arguments.
        user_args = None
        if app_config.user_args:
            user_args = app_config.user_args
        elif settings.OCR_USER_ARGS is not None:  # pragma: no cover
            try:
                user_args = json.loads(settings.OCR_USER_ARGS)
            except json.JSONDecodeError:
                user_args = {}

        self.user_args = user_args

View File

@ -0,0 +1,180 @@
# Generated by Django 4.2.7 on 2023-12-19 17:51
import django.core.validators
from django.db import migrations
from django.db import models
def _create_singleton(apps, schema_editor):
    """
    Data migration step which inserts the first and only row of the
    configuration model, using the model's field defaults.

    The schema_editor argument is part of the RunPython callback signature
    and is not used.
    """
    apps.get_model("paperless", "ApplicationConfiguration").objects.create()
class Migration(migrations.Migration):
    """
    Initial migration: creates the ApplicationConfiguration table and then
    inserts its single row.
    """

    initial = True

    dependencies = []

    operations = [
        # Every settings column below is nullable (null=True)
        migrations.CreateModel(
            name="ApplicationConfiguration",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                (
                    "output_type",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("pdf", "pdf"),
                            ("pdfa", "pdfa"),
                            ("pdfa-1", "pdfa-1"),
                            ("pdfa-2", "pdfa-2"),
                            ("pdfa-3", "pdfa-3"),
                        ],
                        max_length=8,
                        null=True,
                        verbose_name="Sets the output PDF type",
                    ),
                ),
                (
                    "pages",
                    models.PositiveIntegerField(
                        null=True,
                        validators=[
                            django.core.validators.MinValueValidator(1),
                        ],
                        verbose_name="Do OCR from page 1 to this value",
                    ),
                ),
                (
                    "language",
                    models.CharField(
                        blank=True,
                        max_length=32,
                        null=True,
                        verbose_name="Do OCR using these languages",
                    ),
                ),
                (
                    "mode",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("skip", "skip"),
                            ("redo", "redo"),
                            ("force", "force"),
                            ("skip_noarchive", "skip_noarchive"),
                        ],
                        max_length=16,
                        null=True,
                        verbose_name="Sets the OCR mode",
                    ),
                ),
                (
                    "skip_archive_file",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("never", "never"),
                            ("with_text", "with_text"),
                            ("always", "always"),
                        ],
                        max_length=16,
                        null=True,
                        verbose_name="Controls the generation of an archive file",
                    ),
                ),
                (
                    "image_dpi",
                    models.PositiveIntegerField(
                        null=True,
                        validators=[
                            django.core.validators.MinValueValidator(1),
                        ],
                        verbose_name="Sets image DPI fallback value",
                    ),
                ),
                (
                    "unpaper_clean",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("clean", "clean"),
                            ("clean-final", "clean-final"),
                            ("none", "none"),
                        ],
                        max_length=16,
                        null=True,
                        verbose_name="Controls the unpaper cleaning",
                    ),
                ),
                (
                    "deskew",
                    models.BooleanField(null=True, verbose_name="Enables deskew"),
                ),
                (
                    "rotate_pages",
                    models.BooleanField(
                        null=True,
                        verbose_name="Enables page rotation",
                    ),
                ),
                (
                    "rotate_pages_threshold",
                    models.FloatField(
                        null=True,
                        validators=[django.core.validators.MinValueValidator(0.0)],
                        verbose_name="Sets the threshold for rotation of pages",
                    ),
                ),
                (
                    "max_image_pixels",
                    models.FloatField(
                        null=True,
                        validators=[
                            django.core.validators.MinValueValidator(1000000.0),
                        ],
                        verbose_name="Sets the maximum image size for decompression",
                    ),
                ),
                (
                    "color_conversion_strategy",
                    models.CharField(
                        blank=True,
                        choices=[
                            ("LeaveColorUnchanged", "LeaveColorUnchanged"),
                            ("RGB", "RGB"),
                            ("UseDeviceIndependentColor", "UseDeviceIndependentColor"),
                            ("Gray", "Gray"),
                            ("CMYK", "CMYK"),
                        ],
                        max_length=32,
                        null=True,
                        verbose_name="Sets the Ghostscript color conversion strategy",
                    ),
                ),
                (
                    "user_args",
                    models.JSONField(
                        null=True,
                        verbose_name="Adds additional user arguments for OCRMyPDF",
                    ),
                ),
            ],
            options={
                "verbose_name": "paperless application settings",
            },
        ),
        # Insert the single configuration row; the reverse direction of this
        # data step is a no-op
        migrations.RunPython(_create_singleton, migrations.RunPython.noop),
    ]

View File

173
src/paperless/models.py Normal file
View File

@ -0,0 +1,173 @@
from django.core.validators import MinValueValidator
from django.db import models
from django.utils.translation import gettext_lazy as _
DEFAULT_SINGLETON_INSTANCE_ID = 1
class AbstractSingletonModel(models.Model):
    """
    Abstract base for models which should only ever hold a single row.
    """

    class Meta:
        abstract = True

    def save(self, *args, **kwargs):
        """
        Always save as the first and only model
        """
        # Force the primary key to the fixed singleton id before saving, so
        # every save targets the same row
        self.pk = DEFAULT_SINGLETON_INSTANCE_ID
        super().save(*args, **kwargs)
class OutputTypeChoices(models.TextChoices):
    """
    Allowed PDF output types, matching the values of --output-type
    """

    PDF = "pdf", _("pdf")
    PDF_A = "pdfa", _("pdfa")
    PDF_A1 = "pdfa-1", _("pdfa-1")
    PDF_A2 = "pdfa-2", _("pdfa-2")
    PDF_A3 = "pdfa-3", _("pdfa-3")
class ModeChoices(models.TextChoices):
    """
    Allowed OCR modes. The first three match --skip-text, --redo-ocr and
    --force-ocr, the last one is our own custom setting
    """

    SKIP = "skip", _("skip")
    REDO = "redo", _("redo")
    FORCE = "force", _("force")
    SKIP_NO_ARCHIVE = "skip_noarchive", _("skip_noarchive")
class ArchiveFileChoices(models.TextChoices):
    """
    Allowed values of the setting which controls whether an archive PDF
    file is created
    """

    NEVER = "never", _("never")
    WITH_TEXT = "with_text", _("with_text")
    ALWAYS = "always", _("always")
class CleanChoices(models.TextChoices):
    """
    Allowed cleaning settings, matching --clean and --clean-final, plus a
    value for no cleaning
    """

    CLEAN = "clean", _("clean")
    FINAL = "clean-final", _("clean-final")
    NONE = "none", _("none")
class ColorConvertChoices(models.TextChoices):
    """
    Allowed color conversion strategies. The Ghostscript documentation
    describes the valid options
    """

    UNCHANGED = "LeaveColorUnchanged", _("LeaveColorUnchanged")
    RGB = "RGB", _("RGB")
    INDEPENDENT = "UseDeviceIndependentColor", _("UseDeviceIndependentColor")
    GRAY = "Gray", _("Gray")
    CMYK = "CMYK", _("CMYK")
class ApplicationConfiguration(AbstractSingletonModel):
    """
    Single-row model holding the application settings which can be stored
    in the database. Every settings field is nullable.
    """

    # Settings which are common across more than 1 parser
    output_type = models.CharField(
        verbose_name=_("Sets the output PDF type"),
        null=True,
        blank=True,
        max_length=8,
        choices=OutputTypeChoices.choices,
    )

    """
    Settings for the Tesseract based OCR parser
    """

    pages = models.PositiveIntegerField(
        verbose_name=_("Do OCR from page 1 to this value"),
        null=True,
        validators=[MinValueValidator(1)],
    )

    language = models.CharField(
        verbose_name=_("Do OCR using these languages"),
        null=True,
        blank=True,
        max_length=32,
    )

    mode = models.CharField(
        verbose_name=_("Sets the OCR mode"),
        null=True,
        blank=True,
        max_length=16,
        choices=ModeChoices.choices,
    )

    skip_archive_file = models.CharField(
        verbose_name=_("Controls the generation of an archive file"),
        null=True,
        blank=True,
        max_length=16,
        choices=ArchiveFileChoices.choices,
    )

    image_dpi = models.PositiveIntegerField(
        verbose_name=_("Sets image DPI fallback value"),
        null=True,
        validators=[MinValueValidator(1)],
    )

    # Can't call it clean, that's a model method
    unpaper_clean = models.CharField(
        verbose_name=_("Controls the unpaper cleaning"),
        null=True,
        blank=True,
        max_length=16,
        choices=CleanChoices.choices,
    )

    deskew = models.BooleanField(verbose_name=_("Enables deskew"), null=True)

    rotate_pages = models.BooleanField(
        verbose_name=_("Enables page rotation"),
        null=True,
    )

    rotate_pages_threshold = models.FloatField(
        verbose_name=_("Sets the threshold for rotation of pages"),
        null=True,
        validators=[MinValueValidator(0.0)],
    )

    # The validator rejects values below one million
    max_image_pixels = models.FloatField(
        verbose_name=_("Sets the maximum image size for decompression"),
        null=True,
        validators=[MinValueValidator(1_000_000.0)],
    )

    color_conversion_strategy = models.CharField(
        verbose_name=_("Sets the Ghostscript color conversion strategy"),
        blank=True,
        null=True,
        max_length=32,
        choices=ColorConvertChoices.choices,
    )

    # Stored as JSON
    user_args = models.JSONField(
        verbose_name=_("Adds additional user arguments for OCRMyPDF"),
        null=True,
    )

    class Meta:
        verbose_name = _("paperless application settings")

    def __str__(self) -> str:  # pragma: no cover
        return "ApplicationConfiguration"

View File

@ -3,6 +3,8 @@ from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from rest_framework import serializers
from paperless.models import ApplicationConfiguration
class ObfuscatedUserPasswordField(serializers.Field):
"""
@ -113,3 +115,9 @@ class ProfileSerializer(serializers.ModelSerializer):
"last_name",
"auth_token",
)
class ApplicationConfigurationSerializer(serializers.ModelSerializer):
    """
    Model serializer for ApplicationConfiguration exposing every model field
    """

    class Meta:
        model = ApplicationConfiguration
        # "__all__" includes every field of the model
        fields = "__all__"

View File

@ -57,6 +57,15 @@ def __get_int(key: str, default: int) -> int:
return int(os.getenv(key, default))
def __get_optional_int(key: str) -> Optional[int]:
    """
    Return the environment variable parsed as an integer, or None when the
    variable is not set at all
    """
    if key not in os.environ:
        return None
    # The default is never used here, as the key is known to be present
    return __get_int(key, -1)  # pragma: no cover
def __get_float(key: str, default: float) -> float:
"""
Return a float value based on the environment variable or a default
@ -66,18 +75,24 @@ def __get_float(key: str, default: float) -> float:
def __get_path(
key: str,
default: Optional[Union[PathLike, str]] = None,
) -> Optional[Path]:
default: Union[PathLike, str],
) -> Path:
"""
Return a normalized, absolute path based on the environment variable or a default,
if provided. If not set and no default, returns None
if provided
"""
if key in os.environ:
return Path(os.environ[key]).resolve()
elif default is not None:
return Path(default).resolve()
else:
return None
return Path(default).resolve()
def __get_optional_path(key: str) -> Optional[Path]:
    """
    Return the environment variable as a fully resolved Path, or None when
    the variable is not set at all
    """
    if key not in os.environ:
        return None
    # The default is never used here, as the key is known to be present
    return __get_path(key, "")
def __get_list(
@ -327,7 +342,7 @@ MIDDLEWARE = [
]
# Optional to enable compression
if __get_boolean("PAPERLESS_ENABLE_COMPRESSION", "yes"): # pragma: nocover
if __get_boolean("PAPERLESS_ENABLE_COMPRESSION", "yes"): # pragma: no cover
MIDDLEWARE.insert(0, "compression_middleware.middleware.CompressionMiddleware")
ROOT_URLCONF = "paperless.urls"
@ -495,7 +510,7 @@ CSRF_COOKIE_NAME = f"{COOKIE_PREFIX}csrftoken"
SESSION_COOKIE_NAME = f"{COOKIE_PREFIX}sessionid"
LANGUAGE_COOKIE_NAME = f"{COOKIE_PREFIX}django_language"
EMAIL_CERTIFICATE_FILE = __get_path("PAPERLESS_EMAIL_CERTIFICATE_LOCATION")
EMAIL_CERTIFICATE_FILE = __get_optional_path("PAPERLESS_EMAIL_CERTIFICATE_LOCATION")
###############################################################################
@ -796,11 +811,10 @@ CONSUMER_BARCODE_STRING: Final[str] = os.getenv(
"PATCHT",
)
consumer_barcode_scanner_tmp: Final[str] = os.getenv(
CONSUMER_BARCODE_SCANNER: Final[str] = os.getenv(
"PAPERLESS_CONSUMER_BARCODE_SCANNER",
"PYZBAR",
)
CONSUMER_BARCODE_SCANNER = consumer_barcode_scanner_tmp.upper()
).upper()
CONSUMER_ENABLE_ASN_BARCODE: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_ASN_BARCODE",
@ -811,15 +825,12 @@ CONSUMER_ASN_BARCODE_PREFIX: Final[str] = os.getenv(
"ASN",
)
CONSUMER_BARCODE_UPSCALE: Final[float] = float(
os.getenv("PAPERLESS_CONSUMER_BARCODE_UPSCALE", 0.0),
CONSUMER_BARCODE_UPSCALE: Final[float] = __get_float(
"PAPERLESS_CONSUMER_BARCODE_UPSCALE",
0.0,
)
CONSUMER_BARCODE_DPI: Final[str] = int(
os.getenv("PAPERLESS_CONSUMER_BARCODE_DPI", 300),
)
CONSUMER_BARCODE_DPI: Final[int] = __get_int("PAPERLESS_CONSUMER_BARCODE_DPI", 300)
CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED",
@ -834,7 +845,7 @@ CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
)
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
OCR_PAGES = __get_optional_int("PAPERLESS_OCR_PAGES")
# The default language that tesseract will attempt to use when parsing
# documents. It should be a 3-letter language code consistent with ISO 639.
@ -848,28 +859,29 @@ OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
OCR_SKIP_ARCHIVE_FILE = os.getenv("PAPERLESS_OCR_SKIP_ARCHIVE_FILE", "never")
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
OCR_IMAGE_DPI = __get_optional_int("PAPERLESS_OCR_IMAGE_DPI")
OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean")
OCR_DESKEW = __get_boolean("PAPERLESS_OCR_DESKEW", "true")
OCR_DESKEW: Final[bool] = __get_boolean("PAPERLESS_OCR_DESKEW", "true")
OCR_ROTATE_PAGES = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES", "true")
OCR_ROTATE_PAGES: Final[bool] = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES", "true")
OCR_ROTATE_PAGES_THRESHOLD = float(
os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 12.0),
OCR_ROTATE_PAGES_THRESHOLD: Final[float] = __get_float(
"PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD",
12.0,
)
OCR_MAX_IMAGE_PIXELS: Optional[int] = None
if os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS") is not None:
OCR_MAX_IMAGE_PIXELS: int = int(os.environ.get("PAPERLESS_OCR_MAX_IMAGE_PIXELS"))
OCR_MAX_IMAGE_PIXELS: Final[Optional[int]] = __get_optional_int(
"PAPERLESS_OCR_MAX_IMAGE_PIXELS",
)
OCR_COLOR_CONVERSION_STRATEGY = os.getenv(
"PAPERLESS_OCR_COLOR_CONVERSION_STRATEGY",
"RGB",
)
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}")
OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS")
# GNUPG needs a home directory for some reason
GNUPG_HOME = os.getenv("HOME", "/tmp")

View File

@ -35,6 +35,7 @@ from documents.views import TasksViewSet
from documents.views import UiSettingsView
from documents.views import UnifiedSearchViewSet
from paperless.consumers import StatusConsumer
from paperless.views import ApplicationConfigurationViewSet
from paperless.views import FaviconView
from paperless.views import GenerateAuthTokenView
from paperless.views import GroupViewSet
@ -60,6 +61,7 @@ api_router.register(r"mail_rules", MailRuleViewSet)
api_router.register(r"share_links", ShareLinkViewSet)
api_router.register(r"consumption_templates", ConsumptionTemplateViewSet)
api_router.register(r"custom_fields", CustomFieldViewSet)
api_router.register(r"config", ApplicationConfigurationViewSet)
urlpatterns = [

View File

@ -18,6 +18,8 @@ from rest_framework.viewsets import ModelViewSet
from documents.permissions import PaperlessObjectPermissions
from paperless.filters import GroupFilterSet
from paperless.filters import UserFilterSet
from paperless.models import ApplicationConfiguration
from paperless.serialisers import ApplicationConfigurationSerializer
from paperless.serialisers import GroupSerializer
from paperless.serialisers import ProfileSerializer
from paperless.serialisers import UserSerializer
@ -71,7 +73,7 @@ class StandardPagination(PageNumberPagination):
class FaviconView(View):
def get(self, request, *args, **kwargs): # pragma: nocover
def get(self, request, *args, **kwargs): # pragma: no cover
favicon = os.path.join(
os.path.dirname(__file__),
"static",
@ -160,3 +162,12 @@ class GenerateAuthTokenView(GenericAPIView):
return Response(
token.key,
)
class ApplicationConfigurationViewSet(ModelViewSet):
    """
    Model viewset for the ApplicationConfiguration model, serialized with
    ApplicationConfigurationSerializer.
    """

    model = ApplicationConfiguration

    # NOTE(review): this is the model manager rather than a queryset such as
    # objects.all() - confirm this is intended
    queryset = ApplicationConfiguration.objects

    serializer_class = ApplicationConfigurationSerializer
    # NOTE(review): authentication is the only permission listed here -
    # confirm whether changing the configuration should also require a
    # model permission
    permission_classes = (IsAuthenticated,)

View File

@ -92,7 +92,7 @@ class BaseMailAction:
M: MailBox,
message_uid: str,
parameter: str,
): # pragma: nocover
): # pragma: no cover
"""
Perform mail action on the given mail uid in the mailbox.
"""
@ -171,7 +171,7 @@ class TagMailAction(BaseMailAction):
return AND(NOT(gmail_label=self.keyword), no_keyword=self.keyword)
else:
return {"no_keyword": self.keyword}
else: # pragma: nocover
else: # pragma: no cover
raise ValueError("This should never happen.")
def post_consume(self, M: MailBox, message_uid: str, parameter: str):
@ -361,7 +361,7 @@ def get_rule_action(rule: MailRule, supports_gmail_labels: bool) -> BaseMailActi
elif rule.action == MailRule.MailAction.TAG:
return TagMailAction(rule.action_parameter, supports_gmail_labels)
else:
raise NotImplementedError("Unknown action.") # pragma: nocover
raise NotImplementedError("Unknown action.") # pragma: no cover
def make_criterias(rule: MailRule, supports_gmail_labels: bool):
@ -397,7 +397,7 @@ def get_mailbox(server, port, security) -> MailBox:
Returns the correct MailBox instance for the given configuration.
"""
ssl_context = ssl.create_default_context()
if settings.EMAIL_CERTIFICATE_FILE is not None: # pragma: nocover
if settings.EMAIL_CERTIFICATE_FILE is not None: # pragma: no cover
ssl_context.load_verify_locations(cafile=settings.EMAIL_CERTIFICATE_FILE)
if security == MailAccount.ImapSecurity.NONE:
@ -407,7 +407,7 @@ def get_mailbox(server, port, security) -> MailBox:
elif security == MailAccount.ImapSecurity.SSL:
mailbox = MailBox(server, port, ssl_context=ssl_context)
else:
raise NotImplementedError("Unknown IMAP security") # pragma: nocover
raise NotImplementedError("Unknown IMAP security") # pragma: no cover
return mailbox
@ -450,7 +450,7 @@ class MailAccountHandler(LoggingMixin):
else:
raise NotImplementedError(
"Unknown title selector.",
) # pragma: nocover
) # pragma: no cover
def _get_correspondent(
self,
@ -478,7 +478,7 @@ class MailAccountHandler(LoggingMixin):
else:
raise NotImplementedError(
"Unknown correspondent selector",
) # pragma: nocover
) # pragma: no cover
def handle_mail_account(self, account: MailAccount):
"""

View File

@ -405,3 +405,9 @@ class MailDocumentParser(DocumentParser):
html_pdf = tempdir / "html.pdf"
html_pdf.write_bytes(response.content)
return html_pdf
def get_settings(self) -> None:
    """
    This parser does not implement additional settings yet, so there is
    no settings object to return
    """
    return None

View File

@ -1,9 +1,9 @@
import json
import os
import re
import subprocess
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING
from typing import Optional
from django.conf import settings
@ -12,6 +12,10 @@ from PIL import Image
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.config import OcrConfig
from paperless.models import ArchiveFileChoices
from paperless.models import CleanChoices
from paperless.models import ModeChoices
class NoTextFoundException(Exception):
@ -30,6 +34,12 @@ class RasterisedDocumentParser(DocumentParser):
logging_name = "paperless.parsing.tesseract"
def get_settings(self) -> OcrConfig:
    """
    This parser uses the OCR configuration settings to parse documents

    Returns a newly constructed OcrConfig on every call.
    """
    return OcrConfig()
def extract_metadata(self, document_path, mime_type):
result = []
if mime_type == "application/pdf":
@ -66,7 +76,7 @@ class RasterisedDocumentParser(DocumentParser):
self.logging_group,
)
def is_image(self, mime_type):
def is_image(self, mime_type) -> bool:
return mime_type in [
"image/png",
"image/jpeg",
@ -76,7 +86,7 @@ class RasterisedDocumentParser(DocumentParser):
"image/webp",
]
def has_alpha(self, image):
def has_alpha(self, image) -> bool:
with Image.open(image) as im:
return im.mode in ("RGBA", "LA")
@ -91,7 +101,7 @@ class RasterisedDocumentParser(DocumentParser):
],
)
def get_dpi(self, image):
def get_dpi(self, image) -> Optional[int]:
try:
with Image.open(image) as im:
x, y = im.info["dpi"]
@ -100,7 +110,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log.warning(f"Error while getting DPI from image {image}: {e}")
return None
def calculate_a4_dpi(self, image):
def calculate_a4_dpi(self, image) -> Optional[int]:
try:
with Image.open(image) as im:
width, height = im.size
@ -113,13 +123,17 @@ class RasterisedDocumentParser(DocumentParser):
self.log.warning(f"Error while calculating DPI for image {image}: {e}")
return None
def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
def extract_text(
self,
sidecar_file: Optional[Path],
pdf_file: Path,
) -> Optional[str]:
# When re-doing OCR, the sidecar contains ONLY the new text, not
# the whole text, so do not utilize it in that case
if (
sidecar_file is not None
and os.path.isfile(sidecar_file)
and settings.OCR_MODE != "redo"
and self.settings.mode != "redo"
):
text = self.read_file_handle_unicode_errors(sidecar_file)
@ -174,6 +188,8 @@ class RasterisedDocumentParser(DocumentParser):
sidecar_file,
safe_fallback=False,
):
if TYPE_CHECKING:
assert isinstance(self.settings, OcrConfig)
ocrmypdf_args = {
"input_file": input_file,
"output_file": output_file,
@ -181,46 +197,47 @@ class RasterisedDocumentParser(DocumentParser):
# processes via the task library.
"use_threads": True,
"jobs": settings.THREADS_PER_WORKER,
"language": settings.OCR_LANGUAGE,
"output_type": settings.OCR_OUTPUT_TYPE,
"language": self.settings.language,
"output_type": self.settings.output_type,
"progress_bar": False,
}
if "pdfa" in ocrmypdf_args["output_type"]:
ocrmypdf_args[
"color_conversion_strategy"
] = settings.OCR_COLOR_CONVERSION_STRATEGY
] = self.settings.color_conversion_strategy
if settings.OCR_MODE == "force" or safe_fallback:
if self.settings.mode == ModeChoices.FORCE or safe_fallback:
ocrmypdf_args["force_ocr"] = True
elif settings.OCR_MODE in ["skip", "skip_noarchive"]:
elif self.settings.mode in {
ModeChoices.SKIP,
ModeChoices.SKIP_NO_ARCHIVE,
}:
ocrmypdf_args["skip_text"] = True
elif settings.OCR_MODE == "redo":
elif self.settings.mode == ModeChoices.REDO:
ocrmypdf_args["redo_ocr"] = True
else:
raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}")
else: # pragma: no cover
raise ParseError(f"Invalid ocr mode: {self.settings.mode}")
if settings.OCR_CLEAN == "clean":
if self.settings.clean == CleanChoices.CLEAN:
ocrmypdf_args["clean"] = True
elif settings.OCR_CLEAN == "clean-final":
if settings.OCR_MODE == "redo":
elif self.settings.clean == CleanChoices.FINAL:
if self.settings.mode == ModeChoices.REDO:
ocrmypdf_args["clean"] = True
else:
# --clean-final is not compatible with --redo-ocr
ocrmypdf_args["clean_final"] = True
if settings.OCR_DESKEW and settings.OCR_MODE != "redo":
if self.settings.deskew and self.settings.mode != ModeChoices.REDO:
# --deskew is not compatible with --redo-ocr
ocrmypdf_args["deskew"] = True
if settings.OCR_ROTATE_PAGES:
if self.settings.rotate:
ocrmypdf_args["rotate_pages"] = True
ocrmypdf_args[
"rotate_pages_threshold"
] = settings.OCR_ROTATE_PAGES_THRESHOLD
ocrmypdf_args["rotate_pages_threshold"] = self.settings.rotate_threshold
if settings.OCR_PAGES > 0:
ocrmypdf_args["pages"] = f"1-{settings.OCR_PAGES}"
if self.settings.pages is not None:
ocrmypdf_args["pages"] = f"1-{self.settings.pages}"
else:
# sidecar is incompatible with pages
ocrmypdf_args["sidecar"] = sidecar_file
@ -239,8 +256,8 @@ class RasterisedDocumentParser(DocumentParser):
if dpi:
self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
ocrmypdf_args["image_dpi"] = dpi
elif settings.OCR_IMAGE_DPI:
ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI
elif self.settings.image_dpi is not None:
ocrmypdf_args["image_dpi"] = self.settings.image_dpi
elif a4_dpi:
ocrmypdf_args["image_dpi"] = a4_dpi
else:
@ -254,19 +271,18 @@ class RasterisedDocumentParser(DocumentParser):
f"Image DPI of {ocrmypdf_args['image_dpi']} is low, OCR may fail",
)
if settings.OCR_USER_ARGS:
if self.settings.user_args is not None:
try:
user_args = json.loads(settings.OCR_USER_ARGS)
ocrmypdf_args = {**ocrmypdf_args, **user_args}
ocrmypdf_args = {**ocrmypdf_args, **self.settings.user_args}
except Exception as e:
self.log.warning(
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
f"they will not be used. Error: {e}",
)
if settings.OCR_MAX_IMAGE_PIXELS is not None:
if self.settings.max_image_pixel is not None:
# Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
max_pixels_mpixels = self.settings.max_image_pixel / 1_000_000.0
if max_pixels_mpixels > 0:
self.log.debug(
f"Calculated {max_pixels_mpixels} megapixels for OCR",
@ -298,8 +314,12 @@ class RasterisedDocumentParser(DocumentParser):
# If the original has text, and the user doesn't want an archive,
# we're done here
skip_archive_for_text = (
settings.OCR_MODE == "skip_noarchive"
or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
self.settings.mode == ModeChoices.SKIP_NO_ARCHIVE
or self.settings.skip_archive_file
in {
ArchiveFileChoices.WITH_TEXT,
ArchiveFileChoices.ALWAYS,
}
)
if skip_archive_for_text and original_has_text:
self.log.debug("Document has text, skipping OCRmyPDF entirely.")
@ -329,7 +349,7 @@ class RasterisedDocumentParser(DocumentParser):
self.log.debug(f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
if settings.OCR_SKIP_ARCHIVE_FILE != "always":
if self.settings.skip_archive_file != ArchiveFileChoices.ALWAYS:
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)

View File

@ -2,7 +2,6 @@ import os
import shutil
import tempfile
import uuid
from contextlib import AbstractContextManager
from pathlib import Path
from unittest import mock
@ -17,28 +16,6 @@ from documents.tests.utils import FileSystemAssertsMixin
from paperless_tesseract.parsers import RasterisedDocumentParser
from paperless_tesseract.parsers import post_process_text
image_to_string_calls = []
def fake_convert(input_file, output_file, **kwargs):
    """
    Test stand-in for a convert call: writes each line of input_file,
    stripped of surrounding whitespace, to its own output file.

    output_file is a %-format pattern which receives the zero-based line
    index. Extra keyword arguments are accepted and ignored.
    """
    with open(input_file) as source:
        content = list(source)

    for index, raw_line in enumerate(content):
        target = output_file % index
        with open(target, "w") as sink:
            sink.write(raw_line.strip())
class FakeImageFile(AbstractContextManager):
    """
    Test stand-in for an opened image: a context manager which yields the
    base name of the file it was given and never suppresses exceptions.
    """

    def __init__(self, fname):
        self.fname = fname

    def __enter__(self):
        # Only the final path component is handed to the with-block
        _, tail = os.path.split(self.fname)
        return tail

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to release; returning None lets exceptions propagate
        return None
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
@ -769,43 +746,52 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(params["sidecar"], "sidecar.txt")
with override_settings(OCR_CLEAN="none"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("clean", params)
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["clean"])
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["clean_final"])
self.assertNotIn("clean", params)
with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["clean"])
self.assertNotIn("clean_final", params)
with override_settings(OCR_DESKEW=True, OCR_MODE="skip"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertTrue(params["deskew"])
with override_settings(OCR_DESKEW=True, OCR_MODE="redo"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("deskew", params)
with override_settings(OCR_DESKEW=False, OCR_MODE="skip"):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("deskew", params)
with override_settings(OCR_MAX_IMAGE_PIXELS=1_000_001.0):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertIn("max_image_mpixels", params)
self.assertAlmostEqual(params["max_image_mpixels"], 1, places=4)
with override_settings(OCR_MAX_IMAGE_PIXELS=-1_000_001.0):
parser = RasterisedDocumentParser(None)
params = parser.construct_ocrmypdf_parameters("", "", "", "")
self.assertNotIn("max_image_mpixels", params)

View File

@ -0,0 +1,232 @@
import json
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless.models import ApplicationConfiguration
from paperless.models import CleanChoices
from paperless.models import ColorConvertChoices
from paperless.models import ModeChoices
from paperless.models import OutputTypeChoices
from paperless_tesseract.parsers import RasterisedDocumentParser
class TestParserSettingsFromDb(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@staticmethod
def get_params():
    """
    Helper which builds a fresh parser and returns only the OCRMyPDF
    parameters it constructs for a fixed PDF input
    """
    parser = RasterisedDocumentParser(None)
    return parser.construct_ocrmypdf_parameters(
        input_file="input.pdf",
        output_file="output.pdf",
        sidecar_file="sidecar.txt",
        mime_type="application/pdf",
        safe_fallback=False,
    )
def test_db_settings_ocr_pages(self):
    """
    GIVEN:
        - OCR_PAGES in the Django settings differs from the page count
          stored on the configuration object
    WHEN:
        - OCR parameters are constructed
    THEN:
        - The page count from the database is utilized
    """
    with override_settings(OCR_PAGES=10):
        config = ApplicationConfiguration.objects.all().first()
        config.pages = 5
        config.save()

        ocr_args = self.get_params()
    self.assertEqual(ocr_args["pages"], "1-5")
def test_db_settings_ocr_language(self):
    """
    GIVEN:
        - OCR_LANGUAGE in the Django settings differs from the language
          stored on the configuration object
    WHEN:
        - OCR parameters are constructed
    THEN:
        - The language from the database is utilized
    """
    with override_settings(OCR_LANGUAGE="eng+deu"):
        config = ApplicationConfiguration.objects.all().first()
        config.language = "fra+ita"
        config.save()

        ocr_args = self.get_params()
    self.assertEqual(ocr_args["language"], "fra+ita")
def test_db_settings_ocr_output_type(self):
    """
    GIVEN:
        - OCR_OUTPUT_TYPE in the Django settings differs from the output
          type stored on the configuration object
    WHEN:
        - OCR parameters are constructed
    THEN:
        - The output type from the database is utilized
    """
    with override_settings(OCR_OUTPUT_TYPE="pdfa-3"):
        config = ApplicationConfiguration.objects.all().first()
        config.output_type = OutputTypeChoices.PDF_A
        config.save()

        ocr_args = self.get_params()
    self.assertEqual(ocr_args["output_type"], "pdfa")
def test_db_settings_ocr_mode(self):
"""
GIVEN:
- Django settings defines different value for OCR_MODE than
configuration object
WHEN:
- OCR parameters are constructed
THEN:
- Configuration from database is utilized
"""
with override_settings(OCR_MODE="redo"):
instance = ApplicationConfiguration.objects.all().first()
instance.mode = ModeChoices.SKIP
instance.save()
params = self.get_params()
self.assertTrue(params["skip_text"])
self.assertNotIn("redo_ocr", params)
self.assertNotIn("force_ocr", params)
def test_db_settings_ocr_clean(self):
"""
GIVEN:
- Django settings defines different value for OCR_CLEAN than
configuration object
WHEN:
- OCR parameters are constructed
THEN:
- Configuration from database is utilized
"""
with override_settings(OCR_CLEAN="clean-final"):
instance = ApplicationConfiguration.objects.all().first()
instance.unpaper_clean = CleanChoices.CLEAN
instance.save()
params = self.get_params()
self.assertTrue(params["clean"])
self.assertNotIn("clean_final", params)
with override_settings(OCR_CLEAN="clean-final"):
instance = ApplicationConfiguration.objects.all().first()
instance.unpaper_clean = CleanChoices.FINAL
instance.save()
params = self.get_params()
self.assertTrue(params["clean_final"])
self.assertNotIn("clean", params)
def test_db_settings_ocr_deskew(self):
"""
GIVEN:
- Django settings defines different value for OCR_DESKEW than
configuration object
WHEN:
- OCR parameters are constructed
THEN:
- Configuration from database is utilized
"""
with override_settings(OCR_DESKEW=False):
instance = ApplicationConfiguration.objects.all().first()
instance.deskew = True
instance.save()
params = self.get_params()
self.assertTrue(params["deskew"])
def test_db_settings_ocr_rotate(self):
"""
GIVEN:
- Django settings defines different value for OCR_ROTATE_PAGES
and OCR_ROTATE_PAGES_THRESHOLD than configuration object
WHEN:
- OCR parameters are constructed
THEN:
- Configuration from database is utilized
"""
with override_settings(OCR_ROTATE_PAGES=False, OCR_ROTATE_PAGES_THRESHOLD=30.0):
instance = ApplicationConfiguration.objects.all().first()
instance.rotate_pages = True
instance.rotate_pages_threshold = 15.0
instance.save()
params = self.get_params()
self.assertTrue(params["rotate_pages"])
self.assertAlmostEqual(params["rotate_pages_threshold"], 15.0)
def test_db_settings_ocr_max_pixels(self):
"""
GIVEN:
- Django settings defines different value for OCR_MAX_IMAGE_PIXELS than
configuration object
WHEN:
- OCR parameters are constructed
THEN:
- Configuration from database is utilized
"""
with override_settings(OCR_MAX_IMAGE_PIXELS=2_000_000.0):
instance = ApplicationConfiguration.objects.all().first()
instance.max_image_pixels = 1_000_000.0
instance.save()
params = self.get_params()
self.assertAlmostEqual(params["max_image_mpixels"], 1.0)
def test_db_settings_ocr_color_convert(self):
"""
GIVEN:
- Django settings defines different value for OCR_COLOR_CONVERSION_STRATEGY than
configuration object
WHEN:
- OCR parameters are constructed
THEN:
- Configuration from database is utilized
"""
with override_settings(OCR_COLOR_CONVERSION_STRATEGY="LeaveColorUnchanged"):
instance = ApplicationConfiguration.objects.all().first()
instance.color_conversion_strategy = ColorConvertChoices.INDEPENDENT
instance.save()
params = self.get_params()
self.assertEqual(
params["color_conversion_strategy"],
"UseDeviceIndependentColor",
)
def test_ocr_user_args(self):
"""
GIVEN:
- Django settings defines different value for OCR_USER_ARGS than
configuration object
WHEN:
- OCR parameters are constructed
THEN:
- Configuration from database is utilized
"""
with override_settings(
OCR_USER_ARGS=json.dumps({"continue_on_soft_render_error": True}),
):
instance = ApplicationConfiguration.objects.all().first()
instance.user_args = {"unpaper_args": "--pre-rotate 90"}
instance.save()
params = self.get_params()
self.assertIn("unpaper_args", params)
self.assertEqual(
params["unpaper_args"],
"--pre-rotate 90",
)

View File

@@ -34,3 +34,9 @@ class TextDocumentParser(DocumentParser):
def parse(self, document_path, mime_type, file_name=None):
    """
    Store the result of read_file_handle_unicode_errors for document_path
    as this parser's text

    mime_type and file_name are accepted but not used here
    """
    content = self.read_file_handle_unicode_errors(document_path)
    self.text = content
def get_settings(self):
    """
    No additional settings are implemented for this parser yet, so there
    is nothing to return
    """
    return None

View File

@@ -10,6 +10,8 @@ from tika_client import TikaClient
from documents.parsers import DocumentParser
from documents.parsers import ParseError
from documents.parsers import make_thumbnail_from_pdf
from paperless.config import OutputTypeConfig
from paperless.models import OutputTypeChoices
class TikaDocumentParser(DocumentParser):
@@ -63,7 +65,7 @@ class TikaDocumentParser(DocumentParser):
document_path.read_bytes(),
mime_type,
)
else: # pragma: nocover
else: # pragma: no cover
raise
except Exception as err:
raise ParseError(
@@ -91,11 +93,14 @@ class TikaDocumentParser(DocumentParser):
timeout=settings.CELERY_TASK_TIME_LIMIT,
) as client, client.libre_office.to_pdf() as route:
# Set the output format of the resulting PDF
if settings.OCR_OUTPUT_TYPE in {"pdfa", "pdfa-2"}:
if settings.OCR_OUTPUT_TYPE in {
OutputTypeChoices.PDF_A,
OutputTypeChoices.PDF_A2,
}:
route.pdf_format(PdfAFormat.A2b)
elif settings.OCR_OUTPUT_TYPE == "pdfa-1":
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A1:
route.pdf_format(PdfAFormat.A1a)
elif settings.OCR_OUTPUT_TYPE == "pdfa-3":
elif settings.OCR_OUTPUT_TYPE == OutputTypeChoices.PDF_A3:
route.pdf_format(PdfAFormat.A3b)
route.convert(document_path)
@@ -111,3 +116,9 @@ class TikaDocumentParser(DocumentParser):
raise ParseError(
f"Error while converting document to PDF: {err}",
) from err
def get_settings(self) -> OutputTypeConfig:
    """
    Build the configuration for this parser, which currently only uses
    the PDF output type configuration
    """
    output_type_config = OutputTypeConfig()
    return output_type_config

View File

@@ -18,6 +18,7 @@ omit =
exclude_also =
if settings.AUDIT_LOG_ENABLED:
if AUDIT_LOG_ENABLED:
if TYPE_CHECKING:
[mypy]
plugins = mypy_django_plugin.main, mypy_drf_plugin.main, numpy.typing.mypy_plugin