Compare commits

..

6 Commits

Author SHA1 Message Date
shamoon
d960aa2699 Use output_content_format poller.result to get clean content 2025-06-17 14:51:49 -07:00
shamoon
0fd6d40b37 Some docs 2025-06-17 14:51:49 -07:00
shamoon
c9f724d417 Test 2025-06-17 14:51:48 -07:00
shamoon
4b2c986cb3 This actually works
[ci skip]
2025-06-17 14:51:48 -07:00
shamoon
e2a2705d23 Basic parse 2025-06-17 14:51:48 -07:00
shamoon
ea63481cd4 Ok, restart implementing this with just azure
[ci skip]
2025-06-17 14:51:45 -07:00
23 changed files with 547 additions and 225 deletions

View File

@@ -13,8 +13,8 @@ echo $(date +%s) > /var/run/s6/container_environment/PAPERLESS_START_TIME_S
# Check if we're starting as a non-root user # Check if we're starting as a non-root user
if [ "$(id --user)" != "0" ]; then if [ "$(id --user)" != "0" ]; then
printf "true" > /var/run/s6/container_environment/USER_IS_NON_ROOT printf "true" > /var/run/s6/container_environment/USER_IS_NON_ROOT
echo "${log_prefix} paperless-ngx docker container running under a user ($(id --user):$(id --group))" echo "${log_prefix} paperless-ngx docker container running under a user ($(id --user):$(id --group))"
else else
printf "/usr/src/paperless" > /var/run/s6/container_environment/HOME printf "/usr/src/paperless" > /var/run/s6/container_environment/HOME
echo "${log_prefix} paperless-ngx docker container starting init as root" echo "${log_prefix} paperless-ngx docker container starting init as root"
fi fi

View File

@@ -1708,3 +1708,23 @@ password. All of these options come from their similarly-named [Django settings]
#### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL} #### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL}
: Defaults to false. : Defaults to false.
## Remote OCR
#### [`PAPERLESS_REMOTE_OCR_ENGINE=<str>`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE}
: The remote OCR engine to use. Currently only Azure AI is supported as "azureai".
Defaults to None, which disables remote OCR.
#### [`PAPERLESS_REMOTE_OCR_API_KEY=<str>`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY}
: The API key to use for the remote OCR engine.
Defaults to None.
#### [`PAPERLESS_REMOTE_OCR_ENDPOINT=<str>`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT}
: The endpoint to use for the remote OCR engine. This is required for Azure AI.
Defaults to None.

View File

@@ -841,6 +841,18 @@ how regularly you intend to scan documents and use paperless.
performed the task associated with the document, move it to the performed the task associated with the document, move it to the
inbox. inbox.
## Remote OCR
!!! important
This feature is disabled by default and will always remain strictly "opt-in".
Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to
[Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence).
This is of course a paid service (with a free tier) which requires an Azure account and subscription. Azure AI is not affiliated with
Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing
the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details.
## Architecture ## Architecture
Paperless-ngx consists of the following components: Paperless-ngx consists of the following components:

View File

@@ -15,6 +15,7 @@ classifiers = [
# This will allow testing to not install a webserver, mysql, etc # This will allow testing to not install a webserver, mysql, etc
dependencies = [ dependencies = [
"azure-ai-documentintelligence>=1.0.2",
"bleach~=6.2.0", "bleach~=6.2.0",
"celery[redis]~=5.5.1", "celery[redis]~=5.5.1",
"channels~=4.2", "channels~=4.2",
@@ -221,12 +222,22 @@ lint.per-file-ignores."src/documents/parsers.py" = [
lint.per-file-ignores."src/documents/signals/handlers.py" = [ lint.per-file-ignores."src/documents/signals/handlers.py" = [
"PTH", "PTH",
] # TODO Enable & remove ] # TODO Enable & remove
lint.per-file-ignores."src/documents/views.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/paperless/checks.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/paperless/settings.py" = [ lint.per-file-ignores."src/paperless/settings.py" = [
"PTH", "PTH",
] # TODO Enable & remove ] # TODO Enable & remove
lint.per-file-ignores."src/paperless_mail/mail.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [ lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
"PTH",
"RUF001", "RUF001",
] ] # TODO PTH Enable & remove
lint.isort.force-single-line = true lint.isort.force-single-line = true
[tool.pytest.ini_options] [tool.pytest.ini_options]

View File

@@ -158,13 +158,10 @@
</div> </div>
<div class="nav-group mt-3 mb-1"> <div class="nav-group mt-3 mb-1">
<h6 class="sidebar-heading px-3 text-muted d-flex align-items-center"> <h6 class="sidebar-heading px-3 text-muted">
<span i18n>Manage</span> <span i18n>Manage</span>
<button class="btn btn-link p-2 py-0" (click)="manageCollapse.toggle()">
<i-bs width="0.9em" height="0.9em" [name]="isManageMenuCollapsed ? 'chevron-down' : 'chevron-up'"></i-bs>
</button>
</h6> </h6>
<ul class="nav flex-column mb-2" #manageCollapse="ngbCollapse" [(ngbCollapse)]="isManageMenuCollapsed"> <ul class="nav flex-column mb-2">
<li class="nav-item app-link" <li class="nav-item app-link"
*pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.Correspondent }"> *pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.Correspondent }">
<a class="nav-link" routerLink="correspondents" routerLinkActive="active" (click)="closeMenu()" <a class="nav-link" routerLink="correspondents" routerLinkActive="active" (click)="closeMenu()"
@@ -238,124 +235,117 @@
</div> </div>
<div class="nav-group mt-auto mb-1"> <div class="nav-group mt-auto mb-1">
<h6 class="sidebar-heading px-3 pt-4 text-muted d-flex align-items-center"> <h6 class="sidebar-heading px-3 pt-4 text-muted">
<span i18n>Administration</span> <span i18n>Administration</span>
<button class="btn btn-link p-2 py-0" (click)="adminCollapse.toggle()">
<i-bs width="0.9em" height="0.9em" [name]="isAdminMenuCollapsed ? 'chevron-down' : 'chevron-up'"></i-bs>
</button>
</h6> </h6>
<div class="mb-2"> <ul class="nav flex-column mb-2">
<ul class="nav flex-column" #adminCollapse="ngbCollapse" [(ngbCollapse)]="isAdminMenuCollapsed"> <li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.UISettings }"
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.UISettings }" tourAnchor="tour.settings">
tourAnchor="tour.settings"> <a class="nav-link" routerLink="settings" routerLinkActive="active" (click)="closeMenu()"
<a class="nav-link" routerLink="settings" routerLinkActive="active" (click)="closeMenu()" ngbPopover="Settings" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
ngbPopover="Settings" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end" container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim"> <i-bs class="me-1" name="gear"></i-bs><span>&nbsp;<ng-container i18n>Settings</ng-container></span>
<i-bs class="me-1" name="gear"></i-bs><span>&nbsp;<ng-container i18n>Settings</ng-container></span> </a>
</a> </li>
</li> <li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.AppConfig }">
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.AppConfig }"> <a class="nav-link" routerLink="config" routerLinkActive="active" (click)="closeMenu()"
<a class="nav-link" routerLink="config" routerLinkActive="active" (click)="closeMenu()" ngbPopover="Configuration" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
ngbPopover="Configuration" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end" container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim"> <i-bs class="me-1" name="sliders2-vertical"></i-bs><span>&nbsp;<ng-container i18n>Configuration</ng-container></span>
<i-bs class="me-1" name="sliders2-vertical"></i-bs><span>&nbsp;<ng-container i18n>Configuration</ng-container></span> </a>
</a> </li>
</li> <li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.User }">
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.User }"> <a class="nav-link" routerLink="usersgroups" routerLinkActive="active" (click)="closeMenu()"
<a class="nav-link" routerLink="usersgroups" routerLinkActive="active" (click)="closeMenu()" ngbPopover="Users & Groups" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
ngbPopover="Users & Groups" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end" container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim"> <i-bs class="me-1" name="people"></i-bs><span>&nbsp;<ng-container i18n>Users & Groups</ng-container></span>
<i-bs class="me-1" name="people"></i-bs><span>&nbsp;<ng-container i18n>Users & Groups</ng-container></span> </a>
</a> </li>
</li> <li class="nav-item app-link"
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.PaperlessTask }"
*pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.PaperlessTask }" tourAnchor="tour.file-tasks">
tourAnchor="tour.file-tasks"> <a class="nav-link" routerLink="tasks" routerLinkActive="active" (click)="closeMenu()"
<a class="nav-link" routerLink="tasks" routerLinkActive="active" (click)="closeMenu()" ngbPopover="File Tasks" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
ngbPopover="File Tasks" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end" container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim"> <i-bs class="me-1" name="list-task"></i-bs><span>&nbsp;<ng-container i18n>File Tasks</ng-container>@if (tasksService.failedFileTasks.length > 0) {
<i-bs class="me-1" name="list-task"></i-bs><span>&nbsp;<ng-container i18n>File Tasks</ng-container>@if (tasksService.failedFileTasks.length > 0) { <span><span class="badge bg-danger ms-2 d-inline">{{tasksService.failedFileTasks.length}}</span></span>
<span><span class="badge bg-danger ms-2 d-inline">{{tasksService.failedFileTasks.length}}</span></span> }</span>
}</span> @if (tasksService.failedFileTasks.length > 0 && slimSidebarEnabled) {
@if (tasksService.failedFileTasks.length > 0 && slimSidebarEnabled) { <span class="badge bg-danger position-absolute top-0 end-0 d-none d-md-block">{{tasksService.failedFileTasks.length}}</span>
<span class="badge bg-danger position-absolute top-0 end-0 d-none d-md-block">{{tasksService.failedFileTasks.length}}</span> }
} </a>
</a> </li>
</li> @if (permissionsService.isAdmin()) {
@if (permissionsService.isAdmin()) { <li class="nav-item app-link">
<li class="nav-item app-link"> <a class="nav-link" routerLink="logs" routerLinkActive="active" (click)="closeMenu()" ngbPopover="Logs"
<a class="nav-link" routerLink="logs" routerLinkActive="active" (click)="closeMenu()" ngbPopover="Logs"
i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end" container="body"
triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-1" name="text-left"></i-bs><span>&nbsp;<ng-container i18n>Logs</ng-container></span>
</a>
</li>
}
</ul>
<ul class="nav flex-column">
<li class="nav-item mt-2" tourAnchor="tour.outro">
<a class="px-3 py-2 text-muted small d-flex align-items-center flex-wrap text-decoration-none"
target="_blank" rel="noopener noreferrer" href="https://docs.paperless-ngx.com" ngbPopover="Documentation"
i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end" container="body" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end" container="body"
triggers="mouseenter:mouseleave" popoverClass="popover-slim"> triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="d-flex" name="question-circle"></i-bs><span class="ms-1">&nbsp;<ng-container i18n>Documentation</ng-container></span> <i-bs class="me-1" name="text-left"></i-bs><span>&nbsp;<ng-container i18n>Logs</ng-container></span>
</a> </a>
</li> </li>
<li class="nav-item" [class.visually-hidden]="slimSidebarEnabled"> }
<div class="px-3 py-0 text-muted small d-flex align-items-center flex-wrap"> <li class="nav-item mt-2" tourAnchor="tour.outro">
<div class="me-3"> <a class="px-3 py-2 text-muted small d-flex align-items-center flex-wrap text-decoration-none"
<a class="text-muted text-decoration-none" target="_blank" rel="noopener noreferrer" target="_blank" rel="noopener noreferrer" href="https://docs.paperless-ngx.com" ngbPopover="Documentation"
href="https://github.com/paperless-ngx/paperless-ngx" ngbPopover="GitHub" i18n-ngbPopover i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end" container="body"
[disablePopover]="!slimSidebarEnabled" placement="end" container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
triggers="mouseenter:mouseleave" popoverClass="popover-slim"> <i-bs class="d-flex" name="question-circle"></i-bs><span class="ms-1">&nbsp;<ng-container i18n>Documentation</ng-container></span>
{{ versionString }} </a>
</a> </li>
</div> <li class="nav-item" [class.visually-hidden]="slimSidebarEnabled">
@if (!settingsService.updateCheckingIsSet || appRemoteVersion) { <div class="px-3 py-0 text-muted small d-flex align-items-center flex-wrap">
<div class="version-check"> <div class="me-3">
<ng-template #updateAvailablePopContent> <a class="text-muted text-decoration-none" target="_blank" rel="noopener noreferrer"
<span class="small">Paperless-ngx {{ appRemoteVersion.version }} <ng-container i18n>is href="https://github.com/paperless-ngx/paperless-ngx" ngbPopover="GitHub" i18n-ngbPopover
available.</ng-container><br /><ng-container i18n>Click to view.</ng-container></span> [disablePopover]="!slimSidebarEnabled" placement="end" container="body"
</ng-template> triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<ng-template #updateCheckingNotEnabledPopContent> {{ versionString }}
<p class="small mb-2"> </a>
<ng-container i18n>Paperless-ngx can automatically check for updates</ng-container> </div>
</p> @if (!settingsService.updateCheckingIsSet || appRemoteVersion) {
<div class="btn-group btn-group-xs flex-fill w-100"> <div class="version-check">
<button class="btn btn-outline-primary" (click)="setUpdateChecking(true)">Enable</button> <ng-template #updateAvailablePopContent>
<button class="btn btn-outline-secondary" (click)="setUpdateChecking(false)">Disable</button> <span class="small">Paperless-ngx {{ appRemoteVersion.version }} <ng-container i18n>is
</div> available.</ng-container><br /><ng-container i18n>Click to view.</ng-container></span>
<p class="small mb-0 mt-2"> </ng-template>
<a class="small text-decoration-none fst-italic" routerLink="/settings" fragment="update-checking" i18n> <ng-template #updateCheckingNotEnabledPopContent>
How does this work? <p class="small mb-2">
</a> <ng-container i18n>Paperless-ngx can automatically check for updates</ng-container>
</p> </p>
</ng-template> <div class="btn-group btn-group-xs flex-fill w-100">
@if (settingsService.updateCheckingIsSet) { <button class="btn btn-outline-primary" (click)="setUpdateChecking(true)">Enable</button>
@if (appRemoteVersion.update_available) { <button class="btn btn-outline-secondary" (click)="setUpdateChecking(false)">Disable</button>
<a class="small text-decoration-none" target="_blank" rel="noopener noreferrer" </div>
href="https://github.com/paperless-ngx/paperless-ngx/releases" <p class="small mb-0 mt-2">
[ngbPopover]="updateAvailablePopContent" popoverClass="shadow" triggers="mouseenter:mouseleave" <a class="small text-decoration-none fst-italic" routerLink="/settings" fragment="update-checking" i18n>
container="body"> How does this work?
<i-bs width="1.2em" height="1.2em" name="info-circle"></i-bs> </a>
@if (appRemoteVersion?.update_available) { </p>
&nbsp;<ng-container i18n>Update available</ng-container> </ng-template>
} @if (settingsService.updateCheckingIsSet) {
</a> @if (appRemoteVersion.update_available) {
} <a class="small text-decoration-none" target="_blank" rel="noopener noreferrer"
} @else { href="https://github.com/paperless-ngx/paperless-ngx/releases"
<a *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.UISettings }" class="small text-decoration-none" routerLink="/settings" fragment="update-checking" [ngbPopover]="updateAvailablePopContent" popoverClass="shadow" triggers="mouseenter:mouseleave"
[ngbPopover]="updateCheckingNotEnabledPopContent" popoverClass="shadow" triggers="mouseenter"
container="body"> container="body">
<i-bs width="1.2em" height="1.2em" name="info-circle"></i-bs> <i-bs width="1.2em" height="1.2em" name="info-circle"></i-bs>
@if (appRemoteVersion?.update_available) {
&nbsp;<ng-container i18n>Update available</ng-container>
}
</a> </a>
} }
</div> } @else {
} <a *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.UISettings }" class="small text-decoration-none" routerLink="/settings" fragment="update-checking"
</div> [ngbPopover]="updateCheckingNotEnabledPopContent" popoverClass="shadow" triggers="mouseenter"
</li> container="body">
</ul> <i-bs width="1.2em" height="1.2em" name="info-circle"></i-bs>
</div> </a>
}
</div>
}
</div>
</li>
</ul>
</div> </div>
</div> </div>
</nav> </nav>

View File

@@ -78,8 +78,6 @@ export class AppFrameComponent
appRemoteVersion: AppRemoteVersion appRemoteVersion: AppRemoteVersion
isMenuCollapsed: boolean = true isMenuCollapsed: boolean = true
isManageMenuCollapsed: boolean = false
isAdminMenuCollapsed: boolean = false
slimSidebarAnimating: boolean = false slimSidebarAnimating: boolean = false

View File

@@ -55,8 +55,6 @@ import {
checkLg, checkLg,
chevronDoubleLeft, chevronDoubleLeft,
chevronDoubleRight, chevronDoubleRight,
chevronDown,
chevronUp,
clipboard, clipboard,
clipboardCheck, clipboardCheck,
clipboardCheckFill, clipboardCheckFill,
@@ -262,8 +260,6 @@ const icons = {
checkAll, checkAll,
checkCircleFill, checkCircleFill,
checkLg, checkLg,
chevronDown,
chevronUp,
chevronDoubleLeft, chevronDoubleLeft,
chevronDoubleRight, chevronDoubleRight,
clipboard, clipboard,

View File

@@ -650,7 +650,7 @@ class DocumentViewSet(
) )
def get_metadata(self, file, mime_type): def get_metadata(self, file, mime_type):
if not Path(file).is_file(): if not os.path.isfile(file):
return None return None
parser_class = get_parser_class_for_mime_type(mime_type) parser_class = get_parser_class_for_mime_type(mime_type)
@@ -668,8 +668,8 @@ class DocumentViewSet(
return [] return []
def get_filesize(self, filename): def get_filesize(self, filename):
if Path(filename).is_file(): if os.path.isfile(filename):
return Path(filename).stat().st_size return os.stat(filename).st_size
else: else:
return None return None
@@ -1215,37 +1215,31 @@ class UnifiedSearchViewSet(DocumentViewSet):
class LogViewSet(ViewSet): class LogViewSet(ViewSet):
permission_classes = (IsAuthenticated, PaperlessAdminPermissions) permission_classes = (IsAuthenticated, PaperlessAdminPermissions)
ALLOWED_LOG_FILES = { log_files = ["paperless", "mail", "celery"]
"paperless": "paperless.log",
"mail": "mail.log",
"celery": "celery.log",
}
def get_log_file(self, log_key: str) -> Path: def get_log_filename(self, log):
return Path(settings.LOGGING_DIR) / self.ALLOWED_LOG_FILES[log_key] return os.path.join(settings.LOGGING_DIR, f"{log}.log")
def retrieve(self, request, *args, **kwargs): def retrieve(self, request, *args, **kwargs):
log_key = kwargs.get("pk") log_file = kwargs.get("pk")
if log_key not in self.ALLOWED_LOG_FILES: if log_file not in self.log_files:
raise Http404 raise Http404
log_file = self.get_log_file(log_key) filename = self.get_log_filename(log_file)
if not log_file.is_file(): if not os.path.isfile(filename):
raise Http404 raise Http404
with log_file.open() as f: with open(filename) as f:
lines = [line.rstrip() for line in f.readlines()] lines = [line.rstrip() for line in f.readlines()]
return Response(lines) return Response(lines)
def list(self, request, *args, **kwargs): def list(self, request, *args, **kwargs):
existing_logs = [ exist = [
log_key log for log in self.log_files if os.path.isfile(self.get_log_filename(log))
for log_key in self.ALLOWED_LOG_FILES
if self.get_log_file(log_key).is_file()
] ]
return Response(existing_logs) return Response(exist)
class SavedViewViewSet(ModelViewSet, PassUserMixin): class SavedViewViewSet(ModelViewSet, PassUserMixin):
@@ -2079,7 +2073,7 @@ class BulkDownloadView(GenericAPIView):
strategy.add_document(document) strategy.add_document(document)
# TODO(stumpylog): Investigate using FileResponse here # TODO(stumpylog): Investigate using FileResponse here
with Path(temp.name).open("rb") as f: with open(temp.name, "rb") as f:
response = HttpResponse(f, content_type="application/zip") response = HttpResponse(f, content_type="application/zip")
response["Content-Disposition"] = '{}; filename="{}"'.format( response["Content-Disposition"] = '{}; filename="{}"'.format(
"attachment", "attachment",

View File

@@ -3,7 +3,6 @@ import os
import pwd import pwd
import shutil import shutil
import stat import stat
from pathlib import Path
from django.conf import settings from django.conf import settings
from django.core.checks import Error from django.core.checks import Error
@@ -20,23 +19,26 @@ writeable_hint = (
) )
def path_check(var, directory: Path) -> list[Error]: def path_check(var, directory):
messages: list[Error] = [] messages = []
if directory: if directory:
if not directory.is_dir(): if not os.path.isdir(directory):
messages.append( messages.append(
Error(exists_message.format(var), exists_hint.format(directory)), Error(exists_message.format(var), exists_hint.format(directory)),
) )
else: else:
test_file: Path = directory / f"__paperless_write_test_{os.getpid()}__" test_file = os.path.join(
directory,
f"__paperless_write_test_{os.getpid()}__",
)
try: try:
with test_file.open("w"): with open(test_file, "w"):
pass pass
except PermissionError: except PermissionError:
dir_stat: os.stat_result = Path(directory).stat() dir_stat = os.stat(directory)
dir_mode: str = stat.filemode(dir_stat.st_mode) dir_mode = stat.filemode(dir_stat.st_mode)
dir_owner: str = pwd.getpwuid(dir_stat.st_uid).pw_name dir_owner = pwd.getpwuid(dir_stat.st_uid).pw_name
dir_group: str = grp.getgrgid(dir_stat.st_gid).gr_name dir_group = grp.getgrgid(dir_stat.st_gid).gr_name
messages.append( messages.append(
Error( Error(
writeable_message.format(var), writeable_message.format(var),
@@ -46,18 +48,14 @@ def path_check(var, directory: Path) -> list[Error]:
), ),
) )
finally: finally:
try: if os.path.isfile(test_file):
if test_file.is_file(): os.remove(test_file)
test_file.unlink()
except (PermissionError, OSError):
# Skip cleanup if we can't access the file — expected in permission tests
pass
return messages return messages
@register() @register()
def paths_check(app_configs, **kwargs) -> list[Error]: def paths_check(app_configs, **kwargs):
""" """
Check the various paths for existence, readability and writeability Check the various paths for existence, readability and writeability
""" """

View File

@@ -317,6 +317,7 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig", "paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig", "paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig", "paperless_mail.apps.PaperlessMailConfig",
"paperless_remote.apps.PaperlessRemoteParserConfig",
"django.contrib.admin", "django.contrib.admin",
"rest_framework", "rest_framework",
"rest_framework.authtoken", "rest_framework.authtoken",
@@ -1277,3 +1278,11 @@ OUTLOOK_OAUTH_ENABLED = bool(
and OUTLOOK_OAUTH_CLIENT_ID and OUTLOOK_OAUTH_CLIENT_ID
and OUTLOOK_OAUTH_CLIENT_SECRET, and OUTLOOK_OAUTH_CLIENT_SECRET,
) )
###############################################################################
# Remote Parser #
###############################################################################
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")

View File

@@ -27,9 +27,9 @@ class TestChecks(DirectoriesMixin, TestCase):
self.assertEqual(paths_check(None), []) self.assertEqual(paths_check(None), [])
@override_settings( @override_settings(
MEDIA_ROOT=Path("uuh"), MEDIA_ROOT="uuh",
DATA_DIR=Path("whatever"), DATA_DIR="whatever",
CONSUMPTION_DIR=Path("idontcare"), CONSUMPTION_DIR="idontcare",
) )
def test_paths_check_dont_exist(self): def test_paths_check_dont_exist(self):
msgs = paths_check(None) msgs = paths_check(None)

View File

@@ -1,6 +1,7 @@
import datetime import datetime
import itertools import itertools
import logging import logging
import os
import ssl import ssl
import tempfile import tempfile
import traceback import traceback
@@ -483,7 +484,7 @@ class MailAccountHandler(LoggingMixin):
return message.subject return message.subject
elif rule.assign_title_from == MailRule.TitleSource.FROM_FILENAME: elif rule.assign_title_from == MailRule.TitleSource.FROM_FILENAME:
return Path(att.filename).stem return os.path.splitext(os.path.basename(att.filename))[0]
elif rule.assign_title_from == MailRule.TitleSource.NONE: elif rule.assign_title_from == MailRule.TitleSource.NONE:
return None return None
@@ -907,7 +908,7 @@ class MailAccountHandler(LoggingMixin):
dir=settings.SCRATCH_DIR, dir=settings.SCRATCH_DIR,
suffix=".eml", suffix=".eml",
) )
with Path(temp_filename).open("wb") as f: with open(temp_filename, "wb") as f:
# Move "From"-header to beginning of file # Move "From"-header to beginning of file
# TODO: This ugly workaround is needed because the parser is # TODO: This ugly workaround is needed because the parser is
# chosen only by the mime_type detected via magic # chosen only by the mime_type detected via magic

View File

@@ -0,0 +1,4 @@
# this is here so that django finds the checks.
from paperless_remote.checks import check_remote_parser_configured
__all__ = ["check_remote_parser_configured"]

View File

@@ -0,0 +1,14 @@
from django.apps import AppConfig
from paperless_remote.signals import remote_consumer_declaration
class PaperlessRemoteParserConfig(AppConfig):
    """Django application config for the ``paperless_remote`` app."""

    name = "paperless_remote"

    def ready(self):
        """Connect the remote parser declaration to the consumer-declaration signal."""
        # The signal is imported inside ready() rather than at module import time.
        from documents.signals import (
            document_consumer_declaration as declaration_signal,
        )

        declaration_signal.connect(remote_consumer_declaration)
        super().ready()

View File

@@ -0,0 +1,15 @@
from django.conf import settings
from django.core.checks import Error
from django.core.checks import register
@register()
def check_remote_parser_configured(app_configs, **kwargs):
    """
    System check for the remote OCR settings.

    Returns a single Error when the "azureai" engine is selected but no
    endpoint is set, and an empty list in every other case.
    """
    azure_selected = settings.REMOTE_OCR_ENGINE == "azureai"
    if not azure_selected or settings.REMOTE_OCR_ENDPOINT:
        return []

    missing_endpoint = Error(
        "Azure AI remote parser requires endpoint to be configured.",
    )
    return [missing_endpoint]

View File

@@ -0,0 +1,108 @@
from pathlib import Path
from django.conf import settings
from paperless_tesseract.parsers import RasterisedDocumentParser
class RemoteEngineConfig:
def __init__(
self,
engine: str,
api_key: str | None = None,
endpoint: str | None = None,
):
self.engine = engine
self.api_key = api_key
self.endpoint = endpoint
def engine_is_valid(self):
valid = self.engine in ["azureai"] and self.api_key is not None
if self.engine == "azureai":
valid = valid and self.endpoint is not None
return valid
class RemoteDocumentParser(RasterisedDocumentParser):
    """
    This parser uses a remote OCR engine to parse documents
    """

    logging_name = "paperless.parsing.remote"

    def get_settings(self) -> RemoteEngineConfig:
        """
        Build the remote engine configuration from the REMOTE_OCR_* settings
        """
        return RemoteEngineConfig(
            engine=settings.REMOTE_OCR_ENGINE,
            api_key=settings.REMOTE_OCR_API_KEY,
            endpoint=settings.REMOTE_OCR_ENDPOINT,
        )

    def supported_mime_types(self):
        """
        Return the mime types handled by this parser, or an empty list when
        no valid remote engine is configured
        """
        # NOTE(review): self.settings is presumably populated by the parent
        # class from get_settings() - confirm against the base parser.
        if self.settings.engine_is_valid():
            return [
                "application/pdf",
                "image/png",
                "image/jpeg",
                "image/tiff",
                "image/bmp",
                "image/gif",
                "image/webp",
            ]
        else:
            return []

    def azure_ai_vision_parse(
        self,
        file: Path,
    ) -> str | None:
        """
        Send the file to Azure AI Document Intelligence ("prebuilt-read" model).

        The searchable PDF produced by the service is written to
        "archive.pdf" inside the parser's temporary directory and recorded
        as self.archive_path. Returns the plain-text content of the
        analysis result.
        """
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
        from azure.ai.documentintelligence.models import AnalyzeOutputOption
        from azure.ai.documentintelligence.models import DocumentContentFormat
        from azure.core.credentials import AzureKeyCredential

        client = DocumentIntelligenceClient(
            endpoint=self.settings.endpoint,
            credential=AzureKeyCredential(self.settings.api_key),
        )

        try:
            with file.open("rb") as f:
                analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())

            poller = client.begin_analyze_document(
                model_id="prebuilt-read",
                body=analyze_request,
                output_content_format=DocumentContentFormat.TEXT,
                output=[AnalyzeOutputOption.PDF],  # request searchable PDF output
                content_type="application/json",
            )
            poller.wait()
            result_id = poller.details["operation_id"]
            result = poller.result()

            # Download the PDF with embedded text
            archive_path = Path(self.tempdir) / "archive.pdf"
            with archive_path.open("wb") as f:
                for chunk in client.get_analyze_result_pdf(
                    model_id="prebuilt-read",
                    result_id=result_id,
                ):
                    f.write(chunk)
            # Only publish the archive once it has been written completely,
            # so a failed download never leaves a partial file referenced.
            self.archive_path = archive_path

            return result.content
        finally:
            # Release the client's underlying HTTP resources on every path.
            client.close()

    def parse(self, document_path: Path, mime_type, file_name=None):
        """
        Populate self.text using the configured remote engine.

        When no valid engine is configured a warning is logged and the text
        is set to an empty string.
        """
        if not self.settings.engine_is_valid():
            self.log.warning(
                "No valid remote parser engine is configured, content will be empty.",
            )
            self.text = ""
            return

        if self.settings.engine == "azureai":
            self.text = self.azure_ai_vision_parse(document_path)

View File

@@ -0,0 +1,18 @@
def get_parser(*args, **kwargs):
    """Create a RemoteDocumentParser, forwarding all arguments to its constructor."""
    # The parser module is imported on call rather than at module import time.
    from paperless_remote.parsers import RemoteDocumentParser as parser_cls

    parser = parser_cls(*args, **kwargs)
    return parser
def get_supported_mime_types():
    """Return the mime types reported by a RemoteDocumentParser built with None."""
    from paperless_remote.parsers import RemoteDocumentParser as parser_cls

    probe = parser_cls(None)
    return probe.supported_mime_types()
def remote_consumer_declaration(sender, **kwargs):
    """
    Signal receiver describing the remote parser.

    Returns a dict with the parser factory, its weight and the mime types
    it currently supports.
    """
    declaration = {"parser": get_parser, "weight": 5}
    declaration["mime_types"] = get_supported_mime_types()
    return declaration

View File

Binary file not shown.

View File

@@ -0,0 +1,29 @@
from django.test import TestCase
from django.test import override_settings
from paperless_remote import check_remote_parser_configured
class TestChecks(TestCase):
    """Tests for the remote parser system check."""

    @override_settings(REMOTE_OCR_ENGINE=None)
    def test_no_engine(self):
        # With no engine selected the check reports nothing.
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 0)

    @override_settings(REMOTE_OCR_ENGINE="azureai")
    @override_settings(REMOTE_OCR_API_KEY="somekey")
    @override_settings(REMOTE_OCR_ENDPOINT=None)
    def test_azure_no_endpoint(self):
        # The azureai engine without an endpoint yields exactly one error.
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 1)
        # Must match the message emitted by check_remote_parser_configured.
        self.assertTrue(
            msgs[0].msg.startswith(
                "Azure AI remote parser requires endpoint to be configured.",
            ),
        )

    @override_settings(REMOTE_OCR_ENGINE="something")
    @override_settings(REMOTE_OCR_API_KEY="somekey")
    def test_valid_configuration(self):
        # An engine other than azureai is not subject to the endpoint check.
        msgs = check_remote_parser_configured(None)
        self.assertEqual(len(msgs), 0)

View File

@@ -0,0 +1,67 @@
import uuid
from pathlib import Path
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    """Tests RemoteDocumentParser with the Azure client and subprocess mocked."""

    SAMPLE_FILES = Path(__file__).resolve().parent / "samples"

    def assertContainsStrings(self, content, strings):
        """
        Assert that every item of ``strings`` occurs in ``content`` and that
        their first occurrences appear in the given order.
        """
        indices = []
        for s in strings:
            if s in content:
                indices.append(content.index(s))
            else:
                self.fail(f"'{s}' is not in '{content}'")
        self.assertListEqual(indices, sorted(indices))

    # Decorators apply bottom-up: the client patch is the first mock argument,
    # the subprocess patch the second.
    @mock.patch("paperless_remote.parsers.subprocess.run")
    @mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
    def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
        """Parsing with the azureai engine stores the analysed text on the parser."""
        # Arrange mock Azure client
        mock_client = mock.Mock()
        mock_client_cls.return_value = mock_client

        # Simulate poller result and its `.details`
        mock_poller = mock.Mock()
        mock_poller.wait.return_value = None
        mock_poller.details = {"operation_id": "fake-op-id"}
        # The parser returns `result.content`; give the poller's result a real
        # string so parser.text is not an auto-created Mock (which supports
        # neither `in` nor a meaningful `.strip()`).
        # NOTE(review): assumes the parser obtains `result` via poller.result()
        # -- confirm against azure_ai_vision_parse.
        mock_poller.result.return_value.content = "This is a test document."
        mock_client.begin_analyze_document.return_value = mock_poller

        # Return dummy PDF bytes
        mock_client.get_analyze_result_pdf.return_value = [
            b"%PDF-",
            b"1.7 ",
            b"FAKEPDF",
        ]

        # Simulate pdftotext by writing dummy text to sidecar file
        def fake_run(cmd, *args, **kwargs):
            with Path(cmd[-1]).open("w", encoding="utf-8") as f:
                f.write("This is a test document.")

        mock_subprocess.side_effect = fake_run

        with override_settings(
            REMOTE_OCR_ENGINE="azureai",
            REMOTE_OCR_API_KEY="somekey",
            REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
        ):
            parser = RemoteDocumentParser(uuid.uuid4())
            parser.parse(
                self.SAMPLE_FILES / "simple-digital.pdf",
                "application/pdf",
            )

            self.assertContainsStrings(
                parser.text.strip(),
                ["This is a test document."],
            )

View File

@@ -1,3 +1,4 @@
import os
import shutil import shutil
import tempfile import tempfile
import uuid import uuid
@@ -69,13 +70,13 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
page_count = parser.get_page_count( page_count = parser.get_page_count(
(self.SAMPLE_FILES / "simple-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
"application/pdf", "application/pdf",
) )
self.assertEqual(page_count, 1) self.assertEqual(page_count, 1)
page_count = parser.get_page_count( page_count = parser.get_page_count(
(self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
"application/pdf", "application/pdf",
) )
self.assertEqual(page_count, 6) self.assertEqual(page_count, 6)
@@ -92,7 +93,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm: with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
page_count = parser.get_page_count( page_count = parser.get_page_count(
(self.SAMPLE_FILES / "password-protected.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "password-protected.pdf"),
"application/pdf", "application/pdf",
) )
self.assertEqual(page_count, None) self.assertEqual(page_count, None)
@@ -101,7 +102,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_thumbnail(self): def test_thumbnail(self):
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail( thumb = parser.get_thumbnail(
(self.SAMPLE_FILES / "simple-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsFile(thumb) self.assertIsFile(thumb)
@@ -118,7 +119,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail( thumb = parser.get_thumbnail(
(self.SAMPLE_FILES / "simple-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsFile(thumb) self.assertIsFile(thumb)
@@ -126,7 +127,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_thumbnail_encrypted(self): def test_thumbnail_encrypted(self):
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail( thumb = parser.get_thumbnail(
(self.SAMPLE_FILES / "encrypted.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "encrypted.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsFile(thumb) self.assertIsFile(thumb)
@@ -134,17 +135,17 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_get_dpi(self): def test_get_dpi(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
dpi = parser.get_dpi((self.SAMPLE_FILES / "simple-no-dpi.png").as_posix()) dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
self.assertEqual(dpi, None) self.assertEqual(dpi, None)
dpi = parser.get_dpi((self.SAMPLE_FILES / "simple.png").as_posix()) dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png"))
self.assertEqual(dpi, 72) self.assertEqual(dpi, 72)
def test_simple_digital(self): def test_simple_digital(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "simple-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
"application/pdf", "application/pdf",
) )
@@ -156,7 +157,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "with-form.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "with-form.pdf"),
"application/pdf", "application/pdf",
) )
@@ -172,7 +173,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "with-form.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "with-form.pdf"),
"application/pdf", "application/pdf",
) )
@@ -186,7 +187,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_signed(self): def test_signed(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "signed.pdf").as_posix(), "application/pdf") parser.parse(os.path.join(self.SAMPLE_FILES, "signed.pdf"), "application/pdf")
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
self.assertContainsStrings( self.assertContainsStrings(
@@ -202,7 +203,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "encrypted.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "encrypted.pdf"),
"application/pdf", "application/pdf",
) )
@@ -213,7 +214,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_with_form_error_notext(self): def test_with_form_error_notext(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "with-form.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "with-form.pdf"),
"application/pdf", "application/pdf",
) )
@@ -227,7 +228,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "with-form.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "with-form.pdf"),
"application/pdf", "application/pdf",
) )
@@ -239,7 +240,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_image_simple(self): def test_image_simple(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.png").as_posix(), "image/png") parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@@ -251,11 +252,11 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
# Copy sample file to temp directory, as the parsing changes the file # Copy sample file to temp directory, as the parsing changes the file
# and this makes it modified to Git # and this makes it modified to Git
sample_file = self.SAMPLE_FILES / "simple-alpha.png" sample_file = os.path.join(self.SAMPLE_FILES, "simple-alpha.png")
dest_file = Path(tempdir) / "simple-alpha.png" dest_file = os.path.join(tempdir, "simple-alpha.png")
shutil.copy(sample_file, dest_file) shutil.copy(sample_file, dest_file)
parser.parse(dest_file.as_posix(), "image/png") parser.parse(dest_file, "image/png")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@@ -265,7 +266,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
dpi = parser.calculate_a4_dpi( dpi = parser.calculate_a4_dpi(
(self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(), os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"),
) )
self.assertEqual(dpi, 62) self.assertEqual(dpi, 62)
@@ -277,7 +278,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def f(): def f():
parser.parse( parser.parse(
(self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(), os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"),
"image/png", "image/png",
) )
@@ -287,7 +288,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_image_no_dpi_default(self): def test_image_no_dpi_default(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(), "image/png") parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@@ -299,7 +300,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page(self): def test_multi_page(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@@ -312,7 +313,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_pages_skip(self): def test_multi_page_pages_skip(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@@ -325,7 +326,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_pages_redo(self): def test_multi_page_pages_redo(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@@ -338,7 +339,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_pages_force(self): def test_multi_page_pages_force(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@@ -351,7 +352,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_analog_pages_skip(self): def test_multi_page_analog_pages_skip(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@@ -375,7 +376,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@@ -397,7 +398,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@@ -419,7 +420,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
@@ -442,7 +443,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf", "application/pdf",
) )
@@ -467,7 +468,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@@ -490,7 +491,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@@ -513,7 +514,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
@@ -536,7 +537,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@@ -559,7 +560,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
@@ -582,7 +583,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
@@ -605,7 +606,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@@ -615,7 +616,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"], ["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"],
) )
with (parser.tempdir / "sidecar.txt").open() as f: with open(os.path.join(parser.tempdir, "sidecar.txt")) as f:
sidecar = f.read() sidecar = f.read()
self.assertIn("[OCR skipped on page(s) 4-6]", sidecar) self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
@@ -636,7 +637,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "single-page-mixed.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "single-page-mixed.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@@ -650,7 +651,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
], ],
) )
with (parser.tempdir / "sidecar.txt").open() as f: with open(os.path.join(parser.tempdir, "sidecar.txt")) as f:
sidecar = f.read().lower() sidecar = f.read().lower()
self.assertIn("this is some text, but in an image, also on page 1.", sidecar) self.assertIn("this is some text, but in an image, also on page 1.", sidecar)
@@ -673,7 +674,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
"application/pdf", "application/pdf",
) )
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
@@ -685,7 +686,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True) @override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
def test_rotate(self): def test_rotate(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "rotated.pdf").as_posix(), "application/pdf") parser.parse(os.path.join(self.SAMPLE_FILES, "rotated.pdf"), "application/pdf")
self.assertContainsStrings( self.assertContainsStrings(
parser.get_text(), parser.get_text(),
[ [
@@ -707,7 +708,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "multi-page-images.tiff").as_posix(), os.path.join(self.SAMPLE_FILES, "multi-page-images.tiff"),
"image/tiff", "image/tiff",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@@ -727,7 +728,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
- Text from all pages extracted - Text from all pages extracted
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
sample_file = self.SAMPLE_FILES / "multi-page-images-alpha.tiff" sample_file = os.path.join(self.SAMPLE_FILES, "multi-page-images-alpha.tiff")
with tempfile.NamedTemporaryFile() as tmp_file: with tempfile.NamedTemporaryFile() as tmp_file:
shutil.copy(sample_file, tmp_file.name) shutil.copy(sample_file, tmp_file.name)
parser.parse( parser.parse(
@@ -752,9 +753,10 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
- Text from all pages extracted - Text from all pages extracted
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
sample_file = ( sample_file = os.path.join(
self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff" self.SAMPLE_FILES,
).as_posix() "multi-page-images-alpha-rgb.tiff",
)
with tempfile.NamedTemporaryFile() as tmp_file: with tempfile.NamedTemporaryFile() as tmp_file:
shutil.copy(sample_file, tmp_file.name) shutil.copy(sample_file, tmp_file.name)
parser.parse( parser.parse(
@@ -843,7 +845,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
(self.SAMPLE_FILES / "rtl-test.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf", "application/pdf",
) )
@@ -858,52 +860,49 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertRaises( self.assertRaises(
ParseError, ParseError,
parser.parse, parser.parse,
(self.SAMPLE_FILES / "simple-digital.pdf").as_posix(), os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
"application/pdf", "application/pdf",
) )
class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase): class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).parent / "samples" SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
def test_bmp(self): def test_bmp(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.bmp").as_posix(), "image/bmp") parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower()) self.assertIn("this is a test document", parser.get_text().lower())
def test_jpg(self): def test_jpg(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.jpg").as_posix(), "image/jpeg") parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower()) self.assertIn("this is a test document", parser.get_text().lower())
def test_heic(self): def test_heic(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.heic").as_posix(), "image/heic") parser.parse(os.path.join(self.SAMPLE_FILES, "simple.heic"), "image/heic")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
self.assertIn("pizza", parser.get_text().lower()) self.assertIn("pizza", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=200) @override_settings(OCR_IMAGE_DPI=200)
def test_gif(self): def test_gif(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.gif").as_posix(), "image/gif") parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower()) self.assertIn("this is a test document", parser.get_text().lower())
def test_tiff(self): def test_tiff(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.tif").as_posix(), "image/tiff") parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower()) self.assertIn("this is a test document", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=72) @override_settings(OCR_IMAGE_DPI=72)
def test_webp(self): def test_webp(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
(self.SAMPLE_FILES / "document.webp").as_posix(),
"image/webp",
)
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
# Older tesseracts consistently mangle the space between "a webp", # Older tesseracts consistently mangle the space between "a webp",
# tesseract 5.3.0 seems to do a better job, so we're accepting both # tesseract 5.3.0 seems to do a better job, so we're accepting both

39
uv.lock generated
View File

@@ -93,6 +93,34 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" }, { url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" },
] ]
[[package]]
name = "azure-ai-documentintelligence"
version = "1.0.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 },
]
[[package]]
name = "azure-core"
version = "1.33.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 },
]
[[package]] [[package]]
name = "babel" name = "babel"
version = "2.17.0" version = "2.17.0"
@@ -1355,6 +1383,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" }, { url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" },
] ]
[[package]]
name = "isodate"
version = "0.7.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 },
]
[[package]] [[package]]
name = "jinja2" name = "jinja2"
version = "3.1.6" version = "3.1.6"
@@ -1883,6 +1920,7 @@ name = "paperless-ngx"
version = "2.16.3" version = "2.16.3"
source = { virtual = "." } source = { virtual = "." }
dependencies = [ dependencies = [
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2013,6 +2051,7 @@ typing = [
[package.metadata] [package.metadata]
requires-dist = [ requires-dist = [
{ name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
{ name = "bleach", specifier = "~=6.2.0" }, { name = "bleach", specifier = "~=6.2.0" },
{ name = "celery", extras = ["redis"], specifier = "~=5.5.1" }, { name = "celery", extras = ["redis"], specifier = "~=5.5.1" },
{ name = "channels", specifier = "~=4.2" }, { name = "channels", specifier = "~=4.2" },