Compare commits

..

6 Commits

Author SHA1 Message Date
shamoon
d960aa2699 Use output_content_format poller.result to get clean content 2025-06-17 14:51:49 -07:00
shamoon
0fd6d40b37 Some docs 2025-06-17 14:51:49 -07:00
shamoon
c9f724d417 Test 2025-06-17 14:51:48 -07:00
shamoon
4b2c986cb3 This actually works
[ci skip]
2025-06-17 14:51:48 -07:00
shamoon
e2a2705d23 Basic parse 2025-06-17 14:51:48 -07:00
shamoon
ea63481cd4 Ok, restart implementing this with just azure
[ci skip]
2025-06-17 14:51:45 -07:00
23 changed files with 547 additions and 225 deletions

View File

@@ -13,8 +13,8 @@ echo $(date +%s) > /var/run/s6/container_environment/PAPERLESS_START_TIME_S
# Check if we're starting as a non-root user
if [ "$(id --user)" != "0" ]; then
printf "true" > /var/run/s6/container_environment/USER_IS_NON_ROOT
echo "${log_prefix} paperless-ngx docker container running under a user ($(id --user):$(id --group))"
echo "${log_prefix} paperless-ngx docker container running under a user ($(id --user):$(id --group))"
else
printf "/usr/src/paperless" > /var/run/s6/container_environment/HOME
echo "${log_prefix} paperless-ngx docker container starting init as root"
echo "${log_prefix} paperless-ngx docker container starting init as root"
fi

View File

@@ -1708,3 +1708,23 @@ password. All of these options come from their similarly-named [Django settings]
#### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL}
: Defaults to false.
## Remote OCR
#### [`PAPERLESS_REMOTE_OCR_ENGINE=<str>`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE}
: The remote OCR engine to use. Currently only Azure AI is supported as "azureai".
Defaults to None, which disables remote OCR.
#### [`PAPERLESS_REMOTE_OCR_API_KEY=<str>`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY}
: The API key to use for the remote OCR engine.
Defaults to None.
#### [`PAPERLESS_REMOTE_OCR_ENDPOINT=<str>`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT}
: The endpoint to use for the remote OCR engine. This is required for Azure AI.
Defaults to None.

View File

@@ -841,6 +841,18 @@ how regularly you intend to scan documents and use paperless.
performed the task associated with the document, move it to the
inbox.
## Remove OCR
!!! important
This feature is disabled by default and will always remain strictly "opt-in".
Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to
[Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence).
This is of course a paid service (with a free tier) which requires an Azure account and subscription. Azure AI is not affiliated with
Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing
the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details.
## Architecture
Paperless-ngx consists of the following components:

View File

@@ -15,6 +15,7 @@ classifiers = [
# This will allow testing to not install a webserver, mysql, etc
dependencies = [
"azure-ai-documentintelligence>=1.0.2",
"bleach~=6.2.0",
"celery[redis]~=5.5.1",
"channels~=4.2",
@@ -221,12 +222,22 @@ lint.per-file-ignores."src/documents/parsers.py" = [
lint.per-file-ignores."src/documents/signals/handlers.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/documents/views.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/paperless/checks.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/paperless/settings.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/paperless_mail/mail.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
"PTH",
"RUF001",
]
] # TODO PTH Enable & remove
lint.isort.force-single-line = true
[tool.pytest.ini_options]

View File

@@ -158,13 +158,10 @@
</div>
<div class="nav-group mt-3 mb-1">
<h6 class="sidebar-heading px-3 text-muted d-flex align-items-center">
<h6 class="sidebar-heading px-3 text-muted">
<span i18n>Manage</span>
<button class="btn btn-link p-2 py-0" (click)="manageCollapse.toggle()">
<i-bs width="0.9em" height="0.9em" [name]="isManageMenuCollapsed ? 'chevron-down' : 'chevron-up'"></i-bs>
</button>
</h6>
<ul class="nav flex-column mb-2" #manageCollapse="ngbCollapse" [(ngbCollapse)]="isManageMenuCollapsed">
<ul class="nav flex-column mb-2">
<li class="nav-item app-link"
*pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.Correspondent }">
<a class="nav-link" routerLink="correspondents" routerLinkActive="active" (click)="closeMenu()"
@@ -238,124 +235,117 @@
</div>
<div class="nav-group mt-auto mb-1">
<h6 class="sidebar-heading px-3 pt-4 text-muted d-flex align-items-center">
<h6 class="sidebar-heading px-3 pt-4 text-muted">
<span i18n>Administration</span>
<button class="btn btn-link p-2 py-0" (click)="adminCollapse.toggle()">
<i-bs width="0.9em" height="0.9em" [name]="isAdminMenuCollapsed ? 'chevron-down' : 'chevron-up'"></i-bs>
</button>
</h6>
<div class="mb-2">
<ul class="nav flex-column" #adminCollapse="ngbCollapse" [(ngbCollapse)]="isAdminMenuCollapsed">
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.UISettings }"
tourAnchor="tour.settings">
<a class="nav-link" routerLink="settings" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="Settings" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-1" name="gear"></i-bs><span>&nbsp;<ng-container i18n>Settings</ng-container></span>
</a>
</li>
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.AppConfig }">
<a class="nav-link" routerLink="config" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="Configuration" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-1" name="sliders2-vertical"></i-bs><span>&nbsp;<ng-container i18n>Configuration</ng-container></span>
</a>
</li>
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.User }">
<a class="nav-link" routerLink="usersgroups" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="Users & Groups" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-1" name="people"></i-bs><span>&nbsp;<ng-container i18n>Users & Groups</ng-container></span>
</a>
</li>
<li class="nav-item app-link"
*pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.PaperlessTask }"
tourAnchor="tour.file-tasks">
<a class="nav-link" routerLink="tasks" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="File Tasks" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-1" name="list-task"></i-bs><span>&nbsp;<ng-container i18n>File Tasks</ng-container>@if (tasksService.failedFileTasks.length > 0) {
<span><span class="badge bg-danger ms-2 d-inline">{{tasksService.failedFileTasks.length}}</span></span>
}</span>
@if (tasksService.failedFileTasks.length > 0 && slimSidebarEnabled) {
<span class="badge bg-danger position-absolute top-0 end-0 d-none d-md-block">{{tasksService.failedFileTasks.length}}</span>
}
</a>
</li>
@if (permissionsService.isAdmin()) {
<li class="nav-item app-link">
<a class="nav-link" routerLink="logs" routerLinkActive="active" (click)="closeMenu()" ngbPopover="Logs"
i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end" container="body"
triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-1" name="text-left"></i-bs><span>&nbsp;<ng-container i18n>Logs</ng-container></span>
</a>
</li>
}
</ul>
<ul class="nav flex-column">
<li class="nav-item mt-2" tourAnchor="tour.outro">
<a class="px-3 py-2 text-muted small d-flex align-items-center flex-wrap text-decoration-none"
target="_blank" rel="noopener noreferrer" href="https://docs.paperless-ngx.com" ngbPopover="Documentation"
<ul class="nav flex-column mb-2">
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.UISettings }"
tourAnchor="tour.settings">
<a class="nav-link" routerLink="settings" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="Settings" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-1" name="gear"></i-bs><span>&nbsp;<ng-container i18n>Settings</ng-container></span>
</a>
</li>
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.AppConfig }">
<a class="nav-link" routerLink="config" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="Configuration" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-1" name="sliders2-vertical"></i-bs><span>&nbsp;<ng-container i18n>Configuration</ng-container></span>
</a>
</li>
<li class="nav-item app-link" *pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.User }">
<a class="nav-link" routerLink="usersgroups" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="Users & Groups" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-1" name="people"></i-bs><span>&nbsp;<ng-container i18n>Users & Groups</ng-container></span>
</a>
</li>
<li class="nav-item app-link"
*pngxIfPermissions="{ action: PermissionAction.View, type: PermissionType.PaperlessTask }"
tourAnchor="tour.file-tasks">
<a class="nav-link" routerLink="tasks" routerLinkActive="active" (click)="closeMenu()"
ngbPopover="File Tasks" i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end"
container="body" triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="me-1" name="list-task"></i-bs><span>&nbsp;<ng-container i18n>File Tasks</ng-container>@if (tasksService.failedFileTasks.length > 0) {
<span><span class="badge bg-danger ms-2 d-inline">{{tasksService.failedFileTasks.length}}</span></span>
}</span>
@if (tasksService.failedFileTasks.length > 0 && slimSidebarEnabled) {
<span class="badge bg-danger position-absolute top-0 end-0 d-none d-md-block">{{tasksService.failedFileTasks.length}}</span>
}
</a>
</li>
@if (permissionsService.isAdmin()) {
<li class="nav-item app-link">
<a class="nav-link" routerLink="logs" routerLinkActive="active" (click)="closeMenu()" ngbPopover="Logs"
i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end" container="body"
triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="d-flex" name="question-circle"></i-bs><span class="ms-1">&nbsp;<ng-container i18n>Documentation</ng-container></span>
<i-bs class="me-1" name="text-left"></i-bs><span>&nbsp;<ng-container i18n>Logs</ng-container></span>
</a>
</li>
<li class="nav-item" [class.visually-hidden]="slimSidebarEnabled">
<div class="px-3 py-0 text-muted small d-flex align-items-center flex-wrap">
<div class="me-3">
<a class="text-muted text-decoration-none" target="_blank" rel="noopener noreferrer"
href="https://github.com/paperless-ngx/paperless-ngx" ngbPopover="GitHub" i18n-ngbPopover
[disablePopover]="!slimSidebarEnabled" placement="end" container="body"
triggers="mouseenter:mouseleave" popoverClass="popover-slim">
{{ versionString }}
</a>
</div>
@if (!settingsService.updateCheckingIsSet || appRemoteVersion) {
<div class="version-check">
<ng-template #updateAvailablePopContent>
<span class="small">Paperless-ngx {{ appRemoteVersion.version }} <ng-container i18n>is
available.</ng-container><br /><ng-container i18n>Click to view.</ng-container></span>
</ng-template>
<ng-template #updateCheckingNotEnabledPopContent>
<p class="small mb-2">
<ng-container i18n>Paperless-ngx can automatically check for updates</ng-container>
</p>
<div class="btn-group btn-group-xs flex-fill w-100">
<button class="btn btn-outline-primary" (click)="setUpdateChecking(true)">Enable</button>
<button class="btn btn-outline-secondary" (click)="setUpdateChecking(false)">Disable</button>
</div>
<p class="small mb-0 mt-2">
<a class="small text-decoration-none fst-italic" routerLink="/settings" fragment="update-checking" i18n>
How does this work?
</a>
</p>
</ng-template>
@if (settingsService.updateCheckingIsSet) {
@if (appRemoteVersion.update_available) {
<a class="small text-decoration-none" target="_blank" rel="noopener noreferrer"
href="https://github.com/paperless-ngx/paperless-ngx/releases"
[ngbPopover]="updateAvailablePopContent" popoverClass="shadow" triggers="mouseenter:mouseleave"
container="body">
<i-bs width="1.2em" height="1.2em" name="info-circle"></i-bs>
@if (appRemoteVersion?.update_available) {
&nbsp;<ng-container i18n>Update available</ng-container>
}
</a>
}
} @else {
<a *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.UISettings }" class="small text-decoration-none" routerLink="/settings" fragment="update-checking"
[ngbPopover]="updateCheckingNotEnabledPopContent" popoverClass="shadow" triggers="mouseenter"
}
<li class="nav-item mt-2" tourAnchor="tour.outro">
<a class="px-3 py-2 text-muted small d-flex align-items-center flex-wrap text-decoration-none"
target="_blank" rel="noopener noreferrer" href="https://docs.paperless-ngx.com" ngbPopover="Documentation"
i18n-ngbPopover [disablePopover]="!slimSidebarEnabled" placement="end" container="body"
triggers="mouseenter:mouseleave" popoverClass="popover-slim">
<i-bs class="d-flex" name="question-circle"></i-bs><span class="ms-1">&nbsp;<ng-container i18n>Documentation</ng-container></span>
</a>
</li>
<li class="nav-item" [class.visually-hidden]="slimSidebarEnabled">
<div class="px-3 py-0 text-muted small d-flex align-items-center flex-wrap">
<div class="me-3">
<a class="text-muted text-decoration-none" target="_blank" rel="noopener noreferrer"
href="https://github.com/paperless-ngx/paperless-ngx" ngbPopover="GitHub" i18n-ngbPopover
[disablePopover]="!slimSidebarEnabled" placement="end" container="body"
triggers="mouseenter:mouseleave" popoverClass="popover-slim">
{{ versionString }}
</a>
</div>
@if (!settingsService.updateCheckingIsSet || appRemoteVersion) {
<div class="version-check">
<ng-template #updateAvailablePopContent>
<span class="small">Paperless-ngx {{ appRemoteVersion.version }} <ng-container i18n>is
available.</ng-container><br /><ng-container i18n>Click to view.</ng-container></span>
</ng-template>
<ng-template #updateCheckingNotEnabledPopContent>
<p class="small mb-2">
<ng-container i18n>Paperless-ngx can automatically check for updates</ng-container>
</p>
<div class="btn-group btn-group-xs flex-fill w-100">
<button class="btn btn-outline-primary" (click)="setUpdateChecking(true)">Enable</button>
<button class="btn btn-outline-secondary" (click)="setUpdateChecking(false)">Disable</button>
</div>
<p class="small mb-0 mt-2">
<a class="small text-decoration-none fst-italic" routerLink="/settings" fragment="update-checking" i18n>
How does this work?
</a>
</p>
</ng-template>
@if (settingsService.updateCheckingIsSet) {
@if (appRemoteVersion.update_available) {
<a class="small text-decoration-none" target="_blank" rel="noopener noreferrer"
href="https://github.com/paperless-ngx/paperless-ngx/releases"
[ngbPopover]="updateAvailablePopContent" popoverClass="shadow" triggers="mouseenter:mouseleave"
container="body">
<i-bs width="1.2em" height="1.2em" name="info-circle"></i-bs>
@if (appRemoteVersion?.update_available) {
&nbsp;<ng-container i18n>Update available</ng-container>
}
</a>
}
</div>
}
</div>
</li>
</ul>
</div>
} @else {
<a *pngxIfPermissions="{ action: PermissionAction.Change, type: PermissionType.UISettings }" class="small text-decoration-none" routerLink="/settings" fragment="update-checking"
[ngbPopover]="updateCheckingNotEnabledPopContent" popoverClass="shadow" triggers="mouseenter"
container="body">
<i-bs width="1.2em" height="1.2em" name="info-circle"></i-bs>
</a>
}
</div>
}
</div>
</li>
</ul>
</div>
</div>
</nav>

View File

@@ -78,8 +78,6 @@ export class AppFrameComponent
appRemoteVersion: AppRemoteVersion
isMenuCollapsed: boolean = true
isManageMenuCollapsed: boolean = false
isAdminMenuCollapsed: boolean = false
slimSidebarAnimating: boolean = false

View File

@@ -55,8 +55,6 @@ import {
checkLg,
chevronDoubleLeft,
chevronDoubleRight,
chevronDown,
chevronUp,
clipboard,
clipboardCheck,
clipboardCheckFill,
@@ -262,8 +260,6 @@ const icons = {
checkAll,
checkCircleFill,
checkLg,
chevronDown,
chevronUp,
chevronDoubleLeft,
chevronDoubleRight,
clipboard,

View File

@@ -650,7 +650,7 @@ class DocumentViewSet(
)
def get_metadata(self, file, mime_type):
if not Path(file).is_file():
if not os.path.isfile(file):
return None
parser_class = get_parser_class_for_mime_type(mime_type)
@@ -668,8 +668,8 @@ class DocumentViewSet(
return []
def get_filesize(self, filename):
if Path(filename).is_file():
return Path(filename).stat().st_size
if os.path.isfile(filename):
return os.stat(filename).st_size
else:
return None
@@ -1215,37 +1215,31 @@ class UnifiedSearchViewSet(DocumentViewSet):
class LogViewSet(ViewSet):
permission_classes = (IsAuthenticated, PaperlessAdminPermissions)
ALLOWED_LOG_FILES = {
"paperless": "paperless.log",
"mail": "mail.log",
"celery": "celery.log",
}
log_files = ["paperless", "mail", "celery"]
def get_log_file(self, log_key: str) -> Path:
return Path(settings.LOGGING_DIR) / self.ALLOWED_LOG_FILES[log_key]
def get_log_filename(self, log):
return os.path.join(settings.LOGGING_DIR, f"{log}.log")
def retrieve(self, request, *args, **kwargs):
log_key = kwargs.get("pk")
if log_key not in self.ALLOWED_LOG_FILES:
log_file = kwargs.get("pk")
if log_file not in self.log_files:
raise Http404
log_file = self.get_log_file(log_key)
filename = self.get_log_filename(log_file)
if not log_file.is_file():
if not os.path.isfile(filename):
raise Http404
with log_file.open() as f:
with open(filename) as f:
lines = [line.rstrip() for line in f.readlines()]
return Response(lines)
def list(self, request, *args, **kwargs):
existing_logs = [
log_key
for log_key in self.ALLOWED_LOG_FILES
if self.get_log_file(log_key).is_file()
exist = [
log for log in self.log_files if os.path.isfile(self.get_log_filename(log))
]
return Response(existing_logs)
return Response(exist)
class SavedViewViewSet(ModelViewSet, PassUserMixin):
@@ -2079,7 +2073,7 @@ class BulkDownloadView(GenericAPIView):
strategy.add_document(document)
# TODO(stumpylog): Investigate using FileResponse here
with Path(temp.name).open("rb") as f:
with open(temp.name, "rb") as f:
response = HttpResponse(f, content_type="application/zip")
response["Content-Disposition"] = '{}; filename="{}"'.format(
"attachment",

View File

@@ -3,7 +3,6 @@ import os
import pwd
import shutil
import stat
from pathlib import Path
from django.conf import settings
from django.core.checks import Error
@@ -20,23 +19,26 @@ writeable_hint = (
)
def path_check(var, directory: Path) -> list[Error]:
messages: list[Error] = []
def path_check(var, directory):
messages = []
if directory:
if not directory.is_dir():
if not os.path.isdir(directory):
messages.append(
Error(exists_message.format(var), exists_hint.format(directory)),
)
else:
test_file: Path = directory / f"__paperless_write_test_{os.getpid()}__"
test_file = os.path.join(
directory,
f"__paperless_write_test_{os.getpid()}__",
)
try:
with test_file.open("w"):
with open(test_file, "w"):
pass
except PermissionError:
dir_stat: os.stat_result = Path(directory).stat()
dir_mode: str = stat.filemode(dir_stat.st_mode)
dir_owner: str = pwd.getpwuid(dir_stat.st_uid).pw_name
dir_group: str = grp.getgrgid(dir_stat.st_gid).gr_name
dir_stat = os.stat(directory)
dir_mode = stat.filemode(dir_stat.st_mode)
dir_owner = pwd.getpwuid(dir_stat.st_uid).pw_name
dir_group = grp.getgrgid(dir_stat.st_gid).gr_name
messages.append(
Error(
writeable_message.format(var),
@@ -46,18 +48,14 @@ def path_check(var, directory: Path) -> list[Error]:
),
)
finally:
try:
if test_file.is_file():
test_file.unlink()
except (PermissionError, OSError):
# Skip cleanup if we can't access the file — expected in permission tests
pass
if os.path.isfile(test_file):
os.remove(test_file)
return messages
@register()
def paths_check(app_configs, **kwargs) -> list[Error]:
def paths_check(app_configs, **kwargs):
"""
Check the various paths for existence, readability and writeability
"""

View File

@@ -317,6 +317,7 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"paperless_remote.apps.PaperlessRemoteParserConfig",
"django.contrib.admin",
"rest_framework",
"rest_framework.authtoken",
@@ -1277,3 +1278,11 @@ OUTLOOK_OAUTH_ENABLED = bool(
and OUTLOOK_OAUTH_CLIENT_ID
and OUTLOOK_OAUTH_CLIENT_SECRET,
)
###############################################################################
# Remote Parser #
###############################################################################
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")

View File

@@ -27,9 +27,9 @@ class TestChecks(DirectoriesMixin, TestCase):
self.assertEqual(paths_check(None), [])
@override_settings(
MEDIA_ROOT=Path("uuh"),
DATA_DIR=Path("whatever"),
CONSUMPTION_DIR=Path("idontcare"),
MEDIA_ROOT="uuh",
DATA_DIR="whatever",
CONSUMPTION_DIR="idontcare",
)
def test_paths_check_dont_exist(self):
msgs = paths_check(None)

View File

@@ -1,6 +1,7 @@
import datetime
import itertools
import logging
import os
import ssl
import tempfile
import traceback
@@ -483,7 +484,7 @@ class MailAccountHandler(LoggingMixin):
return message.subject
elif rule.assign_title_from == MailRule.TitleSource.FROM_FILENAME:
return Path(att.filename).stem
return os.path.splitext(os.path.basename(att.filename))[0]
elif rule.assign_title_from == MailRule.TitleSource.NONE:
return None
@@ -907,7 +908,7 @@ class MailAccountHandler(LoggingMixin):
dir=settings.SCRATCH_DIR,
suffix=".eml",
)
with Path(temp_filename).open("wb") as f:
with open(temp_filename, "wb") as f:
# Move "From"-header to beginning of file
# TODO: This ugly workaround is needed because the parser is
# chosen only by the mime_type detected via magic

View File

@@ -0,0 +1,4 @@
# this is here so that django finds the checks.
from paperless_remote.checks import check_remote_parser_configured
__all__ = ["check_remote_parser_configured"]

View File

@@ -0,0 +1,14 @@
from django.apps import AppConfig
from paperless_remote.signals import remote_consumer_declaration
class PaperlessRemoteParserConfig(AppConfig):
name = "paperless_remote"
def ready(self):
from documents.signals import document_consumer_declaration
document_consumer_declaration.connect(remote_consumer_declaration)
AppConfig.ready(self)

View File

@@ -0,0 +1,15 @@
from django.conf import settings
from django.core.checks import Error
from django.core.checks import register
@register()
def check_remote_parser_configured(app_configs, **kwargs):
if settings.REMOTE_OCR_ENGINE == "azureai" and not settings.REMOTE_OCR_ENDPOINT:
return [
Error(
"Azure AI remote parser requires endpoint to be configured.",
),
]
return []

View File

@@ -0,0 +1,108 @@
from pathlib import Path
from django.conf import settings
from paperless_tesseract.parsers import RasterisedDocumentParser
class RemoteEngineConfig:
def __init__(
self,
engine: str,
api_key: str | None = None,
endpoint: str | None = None,
):
self.engine = engine
self.api_key = api_key
self.endpoint = endpoint
def engine_is_valid(self):
valid = self.engine in ["azureai"] and self.api_key is not None
if self.engine == "azureai":
valid = valid and self.endpoint is not None
return valid
class RemoteDocumentParser(RasterisedDocumentParser):
"""
This parser uses a remote ocr engine to parse documents
"""
logging_name = "paperless.parsing.remote"
def get_settings(self) -> RemoteEngineConfig:
"""
This parser uses the OCR configuration settings to parse documents
"""
return RemoteEngineConfig(
engine=settings.REMOTE_OCR_ENGINE,
api_key=settings.REMOTE_OCR_API_KEY,
endpoint=settings.REMOTE_OCR_ENDPOINT,
)
def supported_mime_types(self):
if self.settings.engine_is_valid():
return [
"application/pdf",
"image/png",
"image/jpeg",
"image/tiff",
"image/bmp",
"image/gif",
"image/webp",
]
else:
return []
def azure_ai_vision_parse(
self,
file: Path,
) -> str | None:
"""
This method uses the Azure AI Vision API to parse documents
"""
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.documentintelligence.models import AnalyzeOutputOption
from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient(
endpoint=self.settings.endpoint,
credential=AzureKeyCredential(self.settings.api_key),
)
with file.open("rb") as f:
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
poller = client.begin_analyze_document(
model_id="prebuilt-read",
body=analyze_request,
output_content_format=DocumentContentFormat.TEXT,
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
content_type="application/json",
)
poller.wait()
result_id = poller.details["operation_id"]
result = poller.result()
# Download the PDF with embedded text
self.archive_path = Path(self.tempdir) / "archive.pdf"
with self.archive_path.open("wb") as f:
for chunk in client.get_analyze_result_pdf(
model_id="prebuilt-read",
result_id=result_id,
):
f.write(chunk)
return result.content
def parse(self, document_path: Path, mime_type, file_name=None):
if not self.settings.engine_is_valid():
self.log.warning(
"No valid remote parser engine is configured, content will be empty.",
)
self.text = ""
return
elif self.settings.engine == "azureai":
self.text = self.azure_ai_vision_parse(document_path)

View File

@@ -0,0 +1,18 @@
def get_parser(*args, **kwargs):
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(*args, **kwargs)
def get_supported_mime_types():
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(None).supported_mime_types()
def remote_consumer_declaration(sender, **kwargs):
return {
"parser": get_parser,
"weight": 5,
"mime_types": get_supported_mime_types(),
}

View File

Binary file not shown.

View File

@@ -0,0 +1,29 @@
from django.test import TestCase
from django.test import override_settings
from paperless_remote import check_remote_parser_configured
class TestChecks(TestCase):
@override_settings(REMOTE_OCR_ENGINE=None)
def test_no_engine(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 0)
@override_settings(REMOTE_OCR_ENGINE="azureai")
@override_settings(REMOTE_OCR_API_KEY="somekey")
@override_settings(REMOTE_OCR_ENDPOINT=None)
def test_azure_no_endpoint(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 1)
self.assertTrue(
msgs[0].msg.startswith(
"Azure AI Vision remote parser requires endpoint to be configured.",
),
)
@override_settings(REMOTE_OCR_ENGINE="something")
@override_settings(REMOTE_OCR_API_KEY="somekey")
def test_valid_configuration(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 0)

View File

@@ -0,0 +1,67 @@
import uuid
from pathlib import Path
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
def assertContainsStrings(self, content, strings):
# Asserts that all strings appear in content, in the given order.
indices = []
for s in strings:
if s in content:
indices.append(content.index(s))
else:
self.fail(f"'{s}' is not in '{content}'")
self.assertListEqual(indices, sorted(indices))
@mock.patch("paperless_remote.parsers.subprocess.run")
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
# Arrange mock Azure client
mock_client = mock.Mock()
mock_client_cls.return_value = mock_client
# Simulate poller result and its `.details`
mock_poller = mock.Mock()
mock_poller.wait.return_value = None
mock_poller.details = {"operation_id": "fake-op-id"}
mock_client.begin_analyze_document.return_value = mock_poller
# Return dummy PDF bytes
mock_client.get_analyze_result_pdf.return_value = [
b"%PDF-",
b"1.7 ",
b"FAKEPDF",
]
# Simulate pdftotext by writing dummy text to sidecar file
def fake_run(cmd, *args, **kwargs):
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
f.write("This is a test document.")
mock_subprocess.side_effect = fake_run
with override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="somekey",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
):
parser = RemoteDocumentParser(uuid.uuid4())
parser.parse(
self.SAMPLE_FILES / "simple-digital.pdf",
"application/pdf",
)
self.assertContainsStrings(
parser.text.strip(),
["This is a test document."],
)

View File

@@ -1,3 +1,4 @@
import os
import shutil
import tempfile
import uuid
@@ -69,13 +70,13 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(uuid.uuid4())
page_count = parser.get_page_count(
(self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
"application/pdf",
)
self.assertEqual(page_count, 1)
page_count = parser.get_page_count(
(self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertEqual(page_count, 6)
@@ -92,7 +93,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(uuid.uuid4())
with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
page_count = parser.get_page_count(
(self.SAMPLE_FILES / "password-protected.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "password-protected.pdf"),
"application/pdf",
)
self.assertEqual(page_count, None)
@@ -101,7 +102,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_thumbnail(self):
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(
(self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
"application/pdf",
)
self.assertIsFile(thumb)
@@ -118,7 +119,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(
(self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
"application/pdf",
)
self.assertIsFile(thumb)
@@ -126,7 +127,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_thumbnail_encrypted(self):
parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail(
(self.SAMPLE_FILES / "encrypted.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "encrypted.pdf"),
"application/pdf",
)
self.assertIsFile(thumb)
@@ -134,17 +135,17 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_get_dpi(self):
parser = RasterisedDocumentParser(None)
dpi = parser.get_dpi((self.SAMPLE_FILES / "simple-no-dpi.png").as_posix())
dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"))
self.assertEqual(dpi, None)
dpi = parser.get_dpi((self.SAMPLE_FILES / "simple.png").as_posix())
dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png"))
self.assertEqual(dpi, 72)
def test_simple_digital(self):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
"application/pdf",
)
@@ -156,7 +157,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "with-form.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "with-form.pdf"),
"application/pdf",
)
@@ -172,7 +173,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "with-form.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "with-form.pdf"),
"application/pdf",
)
@@ -186,7 +187,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_signed(self):
parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "signed.pdf").as_posix(), "application/pdf")
parser.parse(os.path.join(self.SAMPLE_FILES, "signed.pdf"), "application/pdf")
self.assertIsNone(parser.archive_path)
self.assertContainsStrings(
@@ -202,7 +203,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "encrypted.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "encrypted.pdf"),
"application/pdf",
)
@@ -213,7 +214,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_with_form_error_notext(self):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "with-form.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "with-form.pdf"),
"application/pdf",
)
@@ -227,7 +228,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "with-form.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "with-form.pdf"),
"application/pdf",
)
@@ -239,7 +240,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_image_simple(self):
parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.png").as_posix(), "image/png")
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png")
self.assertIsFile(parser.archive_path)
@@ -251,11 +252,11 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
with tempfile.TemporaryDirectory() as tempdir:
# Copy sample file to temp directory, as the parsing changes the file
# and this makes it modified to Git
sample_file = self.SAMPLE_FILES / "simple-alpha.png"
dest_file = Path(tempdir) / "simple-alpha.png"
sample_file = os.path.join(self.SAMPLE_FILES, "simple-alpha.png")
dest_file = os.path.join(tempdir, "simple-alpha.png")
shutil.copy(sample_file, dest_file)
parser.parse(dest_file.as_posix(), "image/png")
parser.parse(dest_file, "image/png")
self.assertIsFile(parser.archive_path)
@@ -265,7 +266,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None)
dpi = parser.calculate_a4_dpi(
(self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(),
os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"),
)
self.assertEqual(dpi, 62)
@@ -277,7 +278,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def f():
parser.parse(
(self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(),
os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"),
"image/png",
)
@@ -287,7 +288,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_image_no_dpi_default(self):
parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(), "image/png")
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")
self.assertIsFile(parser.archive_path)
@@ -299,7 +300,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page(self):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
@@ -312,7 +313,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_pages_skip(self):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
@@ -325,7 +326,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_pages_redo(self):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
@@ -338,7 +339,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_pages_force(self):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
@@ -351,7 +352,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_analog_pages_skip(self):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
@@ -375,7 +376,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
@@ -397,7 +398,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsFile(parser.archive_path)
@@ -419,7 +420,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
@@ -442,7 +443,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
@@ -467,7 +468,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
@@ -490,7 +491,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
@@ -513,7 +514,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
@@ -536,7 +537,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
@@ -559,7 +560,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
@@ -582,7 +583,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
@@ -605,7 +606,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
@@ -615,7 +616,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"],
)
with (parser.tempdir / "sidecar.txt").open() as f:
with open(os.path.join(parser.tempdir, "sidecar.txt")) as f:
sidecar = f.read()
self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
@@ -636,7 +637,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "single-page-mixed.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "single-page-mixed.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
@@ -650,7 +651,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
],
)
with (parser.tempdir / "sidecar.txt").open() as f:
with open(os.path.join(parser.tempdir, "sidecar.txt")) as f:
sidecar = f.read().lower()
self.assertIn("this is some text, but in an image, also on page 1.", sidecar)
@@ -673,7 +674,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertIsNone(parser.archive_path)
@@ -685,7 +686,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
def test_rotate(self):
parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "rotated.pdf").as_posix(), "application/pdf")
parser.parse(os.path.join(self.SAMPLE_FILES, "rotated.pdf"), "application/pdf")
self.assertContainsStrings(
parser.get_text(),
[
@@ -707,7 +708,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
"""
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "multi-page-images.tiff").as_posix(),
os.path.join(self.SAMPLE_FILES, "multi-page-images.tiff"),
"image/tiff",
)
self.assertIsFile(parser.archive_path)
@@ -727,7 +728,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
- Text from all pages extracted
"""
parser = RasterisedDocumentParser(None)
sample_file = self.SAMPLE_FILES / "multi-page-images-alpha.tiff"
sample_file = os.path.join(self.SAMPLE_FILES, "multi-page-images-alpha.tiff")
with tempfile.NamedTemporaryFile() as tmp_file:
shutil.copy(sample_file, tmp_file.name)
parser.parse(
@@ -752,9 +753,10 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
- Text from all pages extracted
"""
parser = RasterisedDocumentParser(None)
sample_file = (
self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff"
).as_posix()
sample_file = os.path.join(
self.SAMPLE_FILES,
"multi-page-images-alpha-rgb.tiff",
)
with tempfile.NamedTemporaryFile() as tmp_file:
shutil.copy(sample_file, tmp_file.name)
parser.parse(
@@ -843,7 +845,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "rtl-test.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf",
)
@@ -858,52 +860,49 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertRaises(
ParseError,
parser.parse,
(self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
"application/pdf",
)
class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).parent / "samples"
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")
def test_bmp(self):
parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.bmp").as_posix(), "image/bmp")
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
def test_jpg(self):
parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.jpg").as_posix(), "image/jpeg")
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
def test_heic(self):
parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.heic").as_posix(), "image/heic")
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.heic"), "image/heic")
self.assertIsFile(parser.archive_path)
self.assertIn("pizza", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=200)
def test_gif(self):
parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.gif").as_posix(), "image/gif")
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
def test_tiff(self):
parser = RasterisedDocumentParser(None)
parser.parse((self.SAMPLE_FILES / "simple.tif").as_posix(), "image/tiff")
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff")
self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=72)
def test_webp(self):
parser = RasterisedDocumentParser(None)
parser.parse(
(self.SAMPLE_FILES / "document.webp").as_posix(),
"image/webp",
)
parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
self.assertIsFile(parser.archive_path)
# Older tesseracts consistently mangle the space between "a webp",
# tesseract 5.3.0 seems to do a better job, so we're accepting both

39
uv.lock generated
View File

@@ -93,6 +93,34 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" },
]
[[package]]
name = "azure-ai-documentintelligence"
version = "1.0.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 },
]
[[package]]
name = "azure-core"
version = "1.33.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 },
]
[[package]]
name = "babel"
version = "2.17.0"
@@ -1355,6 +1383,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" },
]
[[package]]
name = "isodate"
version = "0.7.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 },
]
[[package]]
name = "jinja2"
version = "3.1.6"
@@ -1883,6 +1920,7 @@ name = "paperless-ngx"
version = "2.16.3"
source = { virtual = "." }
dependencies = [
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2013,6 +2051,7 @@ typing = [
[package.metadata]
requires-dist = [
{ name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
{ name = "bleach", specifier = "~=6.2.0" },
{ name = "celery", extras = ["redis"], specifier = "~=5.5.1" },
{ name = "channels", specifier = "~=4.2" },