Compare commits

..

18 Commits

Author SHA1 Message Date
shamoon
dd0ffaf312 Merge branch 'dev' into feature-remote-ocr-2 2025-08-11 10:48:36 -07:00
shamoon
264504affc Fix consumer declaration file extensions 2025-08-10 05:32:52 -07:00
shamoon
4feedf2add Merge branch 'dev' into feature-remote-ocr-2 2025-08-06 16:04:25 -04:00
shamoon
2f76cf9831 Merge branch 'dev' into feature-remote-ocr-2 2025-08-01 23:55:49 -04:00
shamoon
1002d37f6b Update test_parser.py 2025-07-09 11:05:37 -07:00
shamoon
d260a94740 Update parsers.py 2025-07-09 11:02:57 -07:00
shamoon
88c69b83ea Update index.md 2025-07-09 11:00:12 -07:00
shamoon
2557ee2014 Update docs to mention remote OCR with Azure AI 2025-07-09 09:53:30 -07:00
shamoon
3c75deed80 Add paperless_remote tests to testpaths 2025-07-08 14:19:45 -07:00
shamoon
d05343c927 Test fixes / coverage 2025-07-08 14:19:45 -07:00
shamoon
e7972b7eaf Coverage 2025-07-08 14:19:45 -07:00
shamoon
75a091cc0d Fix test 2025-07-08 14:19:44 -07:00
shamoon
dca74803fd Use output_content_format poller.result to get clean content 2025-07-08 14:19:44 -07:00
shamoon
3cf3d868d0 Some docs 2025-07-08 14:19:43 -07:00
shamoon
bf4fc6604a Test 2025-07-08 14:19:43 -07:00
shamoon
e8c1eb86fa This actually works
[ci skip]
2025-07-08 14:19:43 -07:00
shamoon
c3dad3cf69 Basic parse 2025-07-08 14:19:42 -07:00
shamoon
811bd66088 Ok, restart implementing this with just azure
[ci skip]
2025-07-08 14:19:42 -07:00
17 changed files with 393 additions and 324 deletions

View File

@@ -5,7 +5,7 @@
# Purpose: Compiles the frontend
# Notes:
# - Does PNPM stuff with Typescript and such
FROM --platform=$BUILDPLATFORM docker.io/node:20-trixie-slim AS compile-frontend
FROM --platform=$BUILDPLATFORM docker.io/node:20-bookworm-slim AS compile-frontend
COPY ./src-ui /src/src-ui
@@ -32,7 +32,7 @@ RUN set -eux \
# Purpose: Installs s6-overlay and rootfs
# Comments:
# - Don't leave anything extra in here either
FROM ghcr.io/astral-sh/uv:0.8.4-python3.12-trixie-slim AS s6-overlay-base
FROM ghcr.io/astral-sh/uv:0.8.4-python3.12-bookworm-slim AS s6-overlay-base
WORKDIR /usr/src/s6
@@ -170,8 +170,20 @@ RUN set -eux \
&& apt-get update \
&& apt-get install --yes --quiet --no-install-recommends ${RUNTIME_PACKAGES} \
&& echo "Installing pre-built updates" \
&& curl --fail --silent --no-progress-meter --show-error --location --remote-name-all \
&& curl --fail --silent --no-progress-meter --show-error --location --remote-name-all --parallel --parallel-max 4 \
https://github.com/paperless-ngx/builder/releases/download/qpdf-${QPDF_VERSION}/libqpdf29_${QPDF_VERSION}-1_${TARGETARCH}.deb \
https://github.com/paperless-ngx/builder/releases/download/qpdf-${QPDF_VERSION}/qpdf_${QPDF_VERSION}-1_${TARGETARCH}.deb \
https://github.com/paperless-ngx/builder/releases/download/ghostscript-${GS_VERSION}/libgs10_${GS_VERSION}.dfsg-1_${TARGETARCH}.deb \
https://github.com/paperless-ngx/builder/releases/download/ghostscript-${GS_VERSION}/ghostscript_${GS_VERSION}.dfsg-1_${TARGETARCH}.deb \
https://github.com/paperless-ngx/builder/releases/download/ghostscript-${GS_VERSION}/libgs10-common_${GS_VERSION}.dfsg-1_all.deb \
https://github.com/paperless-ngx/builder/releases/download/jbig2enc-${JBIG2ENC_VERSION}/jbig2enc_${JBIG2ENC_VERSION}-1_${TARGETARCH}.deb \
&& echo "Installing qpdf ${QPDF_VERSION}" \
&& dpkg --install ./libqpdf29_${QPDF_VERSION}-1_${TARGETARCH}.deb \
&& dpkg --install ./qpdf_${QPDF_VERSION}-1_${TARGETARCH}.deb \
&& echo "Installing Ghostscript ${GS_VERSION}" \
&& dpkg --install ./libgs10-common_${GS_VERSION}.dfsg-1_all.deb \
&& dpkg --install ./libgs10_${GS_VERSION}.dfsg-1_${TARGETARCH}.deb \
&& dpkg --install ./ghostscript_${GS_VERSION}.dfsg-1_${TARGETARCH}.deb \
&& echo "Installing jbig2enc" \
&& dpkg --install ./jbig2enc_${JBIG2ENC_VERSION}-1_${TARGETARCH}.deb \
&& echo "Configuring imagemagick" \

319
dev.txt
View File

@@ -1,319 +0,0 @@
adduser 3.134
apt 2.6.1
base-files 12.4+deb12u11
base-passwd 3.6.1
bash 5.2.15-2+b8
bsdutils 1:2.38.1-5+deb12u3
ca-certificates 20230311+deb12u1
coreutils 9.1-1
curl 7.88.1-10+deb12u12
dash 0.5.12-2
debconf 1.5.82
debian-archive-keyring 2023.3+deb12u2
debianutils 5.7-0.5~deb12u1
diffutils 1:3.8-4
dirmngr 2.2.40-1.1
dpkg 1.21.22
e2fsprogs 1.47.0-2
file 1:5.44-3
findutils 4.9.0-4
fontconfig 2.14.1-4
fontconfig-config 2.14.1-4
fonts-liberation 1:1.07.4-11
fonts-urw-base35 20200910-7
gcc-12-base 12.2.0-14+deb12u1
gettext 0.21-12
gettext-base 0.21-12
ghostscript 10.03.1~dfsg-1
gnupg 2.2.40-1.1
gnupg-l10n 2.2.40-1.1
gnupg-utils 2.2.40-1.1
gosu 1.14-1+b10
gpg 2.2.40-1.1
gpg-agent 2.2.40-1.1
gpg-wks-client 2.2.40-1.1
gpg-wks-server 2.2.40-1.1
gpgconf 2.2.40-1.1
gpgsm 2.2.40-1.1
gpgv 2.2.40-1.1
grep 3.8-5
gzip 1.12-1
hicolor-icon-theme 0.17-2
hostname 3.23+nmu1
icc-profiles-free 2.0.1+dfsg-1.1
imagemagick 8:6.9.11.60+dfsg-1.6+deb12u3
imagemagick-6-common 8:6.9.11.60+dfsg-1.6+deb12u3
imagemagick-6.q16 8:6.9.11.60+dfsg-1.6+deb12u3
init-system-helpers 1.65.2
jbig2dec 0.19-3
jbig2enc 0.30-1
libacl1 2.3.1-3
libaom3 3.6.0-1+deb12u1
libapt-pkg6.0 2.6.1
libarchive13 3.6.2-1+deb12u2
libassuan0 2.5.5-5
libattr1 1:2.5.1-4
libaudit-common 1:3.0.9-1
libaudit1 1:3.0.9-1
libavahi-client3 0.8-10+deb12u1
libavahi-common-data 0.8-10+deb12u1
libavahi-common3 0.8-10+deb12u1
libavcodec59 7:5.1.6-0+deb12u1
libavformat59 7:5.1.6-0+deb12u1
libavutil57 7:5.1.6-0+deb12u1
libblkid1 2.38.1-5+deb12u3
libbluray2 1:1.3.4-1
libbrotli1 1.0.9-2+b6
libbsd0 0.11.7-2
libbz2-1.0 1.0.8-5+b1
libc-bin 2.36-9+deb12u10
libc6 2.36-9+deb12u10
libcairo-gobject2 1.16.0-7
libcairo2 1.16.0-7
libcap-ng0 0.8.3-1+b3
libcap2 1:2.66-4+deb12u1
libchromaprint1 1.5.1-2+b1
libcjson1 1.7.15-1+deb12u2
libcodec2-1.0 1.0.5-1
libcom-err2 1.47.0-2
libconfig-inifiles-perl 3.000003-2
libcrypt1 1:4.4.33-2
libcups2 2.4.2-3+deb12u8
libcurl4 7.88.1-10+deb12u12
libdatrie1 0.2.13-2+b1
libdav1d6 1.0.0-2+deb12u1
libdb5.3 5.3.28+dfsg2-1
libdbus-1-3 1.14.10-1~deb12u1
libde265-0 1.0.11-1+deb12u2
libdebconfclient0 0.270
libdeflate0 1.14-1
libdrm-common 2.4.114-1
libdrm2 2.4.114-1+b1
libedit2 3.1-20221030-2
libexpat1 2.5.0-1+deb12u1
libext2fs2 1.47.0-2
libffi8 3.4.4-1
libfftw3-double3 3.3.10-1
libfontconfig1 2.14.1-4
libfontenc1 1:1.1.4-1
libfreetype6 2.12.1+dfsg-5+deb12u4
libfribidi0 1.0.8-2.1
libgcc-s1 12.2.0-14+deb12u1
libgcrypt20 1.10.1-3
libgdbm-compat4 1.23-3
libgdbm6 1.23-3
libgdk-pixbuf-2.0-0 2.42.10+dfsg-1+deb12u2
libgdk-pixbuf2.0-common 2.42.10+dfsg-1+deb12u2
libgif7 5.2.1-2.5
libglib2.0-0 2.74.6-2+deb12u6
libgme0 0.6.3-6
libgmp10 2:6.2.1+dfsg1-1.1
libgnutls30 3.7.9-2+deb12u5
libgomp1 12.2.0-14+deb12u1
libgpg-error0 1.46-1
libgraphite2-3 1.3.14-1
libgs-common 10.0.0~dfsg-11+deb12u7
libgs10 10.03.1~dfsg-1
libgs10-common 10.03.1~dfsg-1
libgsm1 1.0.22-1
libgssapi-krb5-2 1.20.1-2+deb12u3
libharfbuzz0b 6.0.0+dfsg-3
libheif1 1.15.1-1+deb12u1
libhogweed6 3.8.1-2
libhwy1 1.0.3-3+deb12u1
libice6 2:1.0.10-1
libicu72 72.1-3+deb12u1
libidn12 1.41-1
libidn2-0 2.3.3-1+b1
libijs-0.35 0.35-15
libimagequant0 2.17.0-1
libjbig0 2.1-6.1
libjbig2dec0 0.19-3
libjpeg62-turbo 1:2.1.5-2
libjxl0.7 0.7.0-10+deb12u1
libk5crypto3 1.20.1-2+deb12u3
libkeyutils1 1.6.3-2
libkrb5-3 1.20.1-2+deb12u3
libkrb5support0 1.20.1-2+deb12u3
libksba8 1.6.3-2
liblcms2-2 2.14-2
libldap-2.5-0 2.5.13+dfsg-5
liblept5 1.82.0-3+b3
liblerc4 4.0.0+ds-2
liblqr-1-0 0.4.2-2.1
libltdl7 2.4.7-7~deb12u1
liblz4-1 1.9.4-1
liblzma5 5.4.1-1
libmagic-mgc 1:5.44-3
libmagic1 1:5.44-3
libmagickcore-6.q16-6 8:6.9.11.60+dfsg-1.6+deb12u3
libmagickwand-6.q16-6 8:6.9.11.60+dfsg-1.6+deb12u3
libmariadb3 1:10.11.11-0+deb12u1
libmbedcrypto7 2.28.3-1
libmd0 1.0.4-2
libmfx1 22.5.4-1
libmount1 2.38.1-5+deb12u3
libmp3lame0 3.100-6
libmpg123-0 1.31.2-1+deb12u1
libncurses6 6.4-4
libncursesw6 6.4-4
libnettle8 3.8.1-2
libnghttp2-14 1.52.0-1+deb12u2
libnorm1 1.5.9+dfsg-2
libnpth0 1.6-3
libnsl2 1.3.0-2
libnspr4 2:4.35-1
libnss3 2:3.87.1-1+deb12u1
libnuma1 2.0.16-1
libogg0 1.3.5-3
libopenjp2-7 2.5.0-2+deb12u1
libopenmpt0 0.6.9-1
libopus0 1.3.1-3
libp11-kit0 0.24.1-2
libpam-modules 1.5.2-6+deb12u1
libpam-modules-bin 1.5.2-6+deb12u1
libpam-runtime 1.5.2-6+deb12u1
libpam0g 1.5.2-6+deb12u1
libpango-1.0-0 1.50.12+ds-1
libpangocairo-1.0-0 1.50.12+ds-1
libpangoft2-1.0-0 1.50.12+ds-1
libpaper1 1.1.29
libpcre2-8-0 10.42-1
libperl5.36 5.36.0-7+deb12u2
libpgm-5.3-0 5.3.128~dfsg-2
libpixman-1-0 0.42.2-1
libpng16-16 1.6.39-2
libpoppler126 22.12.0-2+deb12u1
libpq5 15.13-0+deb12u1
libpsl5 0.21.2-1
libqpdf29 11.9.0-1
librabbitmq4 0.11.0-1+deb12u1
librav1e0 0.5.1-6
libreadline8 8.2-1.3
librist4 0.2.7+dfsg-1
librsvg2-2 2.54.7+dfsg-1~deb12u1
librtmp1 2.4+20151223.gitfa8646d.1-2+b2
libsasl2-2 2.1.28+dfsg-10
libsasl2-modules-db 2.1.28+dfsg-10
libseccomp2 2.5.4-1+deb12u1
libselinux1 3.4-1+b6
libsemanage-common 3.4-1
libsemanage2 3.4-1+b5
libsepol2 3.4-2.1
libshine3 3.1.1-2
libsm6 2:1.2.3-1
libsmartcols1 2.38.1-5+deb12u3
libsnappy1v5 1.1.9-3
libsodium23 1.0.18-1
libsoxr0 0.1.3-4
libspeex1 1.2.1-2
libsqlite3-0 3.40.1-2+deb12u1
libsrt1.5-gnutls 1.5.1-1+deb12u1
libss2 1.47.0-2
libssh-gcrypt-4 0.10.6-0+deb12u1
libssh2-1 1.10.0-3+b1
libssl3 3.0.17-1~deb12u1
libstdc++6 12.2.0-14+deb12u1
libsvtav1enc1 1.4.1+dfsg-1
libswresample4 7:5.1.6-0+deb12u1
libsystemd0 252.38-1~deb12u1
libtasn1-6 4.19.0-2+deb12u1
libtesseract5 5.3.0-2
libthai-data 0.1.29-1
libthai0 0.1.29-1
libtheora0 1.1.1+dfsg.1-16.1+b1
libtiff6 4.5.0-6+deb12u2
libtinfo6 6.4-4
libtirpc-common 1.3.3+ds-1
libtirpc3 1.3.3+ds-1
libtwolame0 0.4.0-2
libudev1 252.38-1~deb12u1
libudfread0 1.1.2-1
libunistring2 1.0-2
libuuid1 2.38.1-5+deb12u3
libv4l-0 1.22.1-5+b2
libv4lconvert0 1.22.1-5+b2
libva-drm2 2.17.0-1
libva-x11-2 2.17.0-1
libva2 2.17.0-1
libvdpau1 1.5-2
libvorbis0a 1.3.7-1
libvorbisenc2 1.3.7-1
libvorbisfile3 1.3.7-1
libvpx7 1.12.0-1+deb12u4
libwebp7 1.2.4-0.2+deb12u1
libwebpdemux2 1.2.4-0.2+deb12u1
libwebpmux3 1.2.4-0.2+deb12u1
libx11-6 2:1.8.4-2+deb12u2
libx11-data 2:1.8.4-2+deb12u2
libx11-xcb1 2:1.8.4-2+deb12u2
libx264-164 2:0.164.3095+gitbaee400-3
libx265-199 3.5-2+b1
libxau6 1:1.0.9-1
libxcb-dri3-0 1.15-1
libxcb-render0 1.15-1
libxcb-shm0 1.15-1
libxcb1 1.15-1
libxdmcp6 1:1.1.2-3
libxext6 2:1.3.4-1+b1
libxfixes3 1:6.0.0-2
libxml2 2.9.14+dfsg-1.3~deb12u2
libxrender1 1:0.9.10-1.1
libxslt1.1 1.1.35-1+deb12u1
libxt6 1:1.2.1-1.1
libxvidcore4 2:1.3.7-1
libxxhash0 0.8.1-1
libzbar0 0.23.92-7+deb12u1
libzmq5 4.3.4-6
libzstd1 1.5.4+dfsg2-5
libzvbi-common 0.2.41-1
libzvbi0 0.2.41-1
login 1:4.13+dfsg1-1+deb12u1
logsave 1.47.0-2
mariadb-client 1:10.11.11-0+deb12u1
mariadb-client-core 1:10.11.11-0+deb12u1
mariadb-common 1:10.11.11-0+deb12u1
mawk 1.3.4.20200120-3.1
media-types 10.0.0
mount 2.38.1-5+deb12u3
mysql-common 5.8+1.1.0
ncurses-base 6.4-4
ncurses-bin 6.4-4
netbase 6.4
ocl-icd-libopencl1 2.3.1-1
openssl 3.0.17-1~deb12u1
passwd 1:4.13+dfsg1-1+deb12u1
perl 5.36.0-7+deb12u2
perl-base 5.36.0-7+deb12u2
perl-modules-5.36 5.36.0-7+deb12u2
pinentry-curses 1.2.1-1
pngquant 2.17.0-1
poppler-data 0.4.12-1
poppler-utils 22.12.0-2+deb12u1
postgresql-client 15+248
postgresql-client-15 15.13-0+deb12u1
postgresql-client-common 248
qpdf 11.9.0-1
readline-common 8.2-1.3
sed 4.9-1
sensible-utils 0.0.17+nmu1
shared-mime-info 2.2-1
sysvinit-utils 3.06-4
tar 1.34+dfsg-1.2+deb12u1
tesseract-ocr 5.3.0-2
tesseract-ocr-deu 1:4.1.0-2
tesseract-ocr-eng 1:4.1.0-2
tesseract-ocr-fra 1:4.1.0-2
tesseract-ocr-ita 1:4.1.0-2
tesseract-ocr-osd 1:4.1.0-2
tesseract-ocr-spa 1:4.1.0-2
tzdata 2025b-0+deb12u1
ucf 3.0043+nmu1+deb12u1
unpaper 7.0.0-0.1
usr-is-merged 37~deb12u1
util-linux 2.38.1-5+deb12u3
util-linux-extra 2.38.1-5+deb12u3
x11-common 1:7.7+23
xfonts-encodings 1:1.0.4-2.2
xfonts-utils 1:7.7+6
zlib1g 1:1.2.13.dfsg-1

View File

@@ -1800,3 +1800,23 @@ password. All of these options come from their similarly-named [Django settings]
#### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL}
: Defaults to false.
## Remote OCR
#### [`PAPERLESS_REMOTE_OCR_ENGINE=<str>`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE}
: The remote OCR engine to use. Currently only Azure AI is supported as "azureai".
Defaults to None, which disables remote OCR.
#### [`PAPERLESS_REMOTE_OCR_API_KEY=<str>`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY}
: The API key to use for the remote OCR engine.
Defaults to None.
#### [`PAPERLESS_REMOTE_OCR_ENDPOINT=<str>`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT}
: The endpoint to use for the remote OCR engine. This is required for Azure AI.
Defaults to None.

View File

@@ -25,9 +25,10 @@ physical documents into a searchable online archive so you can keep, well, _less
## Features
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way.
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
- _New!_ Supports remote OCR with Azure AI (opt-in).
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, PowerPoint, and LibreOffice equivalents)[^1] and more.

View File

@@ -850,6 +850,18 @@ how regularly you intend to scan documents and use paperless.
performed the task associated with the document, move it to the
inbox.
## Remote OCR
!!! important
This feature is disabled by default and will always remain strictly "opt-in".
Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to
[Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence).
This is of course a paid service (with a free tier) which requires an Azure account and subscription. Azure AI is not affiliated with
Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing
the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details.
## Architecture
Paperless-ngx consists of the following components:

View File

@@ -15,6 +15,7 @@ classifiers = [
# This will allow testing to not install a webserver, mysql, etc
dependencies = [
"azure-ai-documentintelligence>=1.0.2",
"bleach~=6.2.0",
"celery[redis]~=5.5.1",
"channels~=4.2",
@@ -233,6 +234,7 @@ testpaths = [
"src/paperless_tesseract/tests/",
"src/paperless_tika/tests",
"src/paperless_text/tests/",
"src/paperless_remote/tests/",
]
addopts = [
"--pythonwarnings=all",

View File

@@ -324,6 +324,7 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"paperless_remote.apps.PaperlessRemoteParserConfig",
"django.contrib.admin",
"rest_framework",
"rest_framework.authtoken",
@@ -1443,3 +1444,10 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean(
"PAPERLESS_WEBHOOKS_ALLOW_INTERNAL_REQUESTS",
"true",
)
###############################################################################
# Remote Parser #
###############################################################################
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")

View File

@@ -0,0 +1,4 @@
# this is here so that django finds the checks.
from paperless_remote.checks import check_remote_parser_configured
__all__ = ["check_remote_parser_configured"]

View File

@@ -0,0 +1,14 @@
from django.apps import AppConfig
from paperless_remote.signals import remote_consumer_declaration
class PaperlessRemoteParserConfig(AppConfig):
name = "paperless_remote"
def ready(self):
from documents.signals import document_consumer_declaration
document_consumer_declaration.connect(remote_consumer_declaration)
AppConfig.ready(self)

View File

@@ -0,0 +1,15 @@
from django.conf import settings
from django.core.checks import Error
from django.core.checks import register
@register()
def check_remote_parser_configured(app_configs, **kwargs):
if settings.REMOTE_OCR_ENGINE == "azureai" and not settings.REMOTE_OCR_ENDPOINT:
return [
Error(
"Azure AI remote parser requires endpoint to be configured.",
),
]
return []

View File

@@ -0,0 +1,113 @@
from pathlib import Path
from django.conf import settings
from paperless_tesseract.parsers import RasterisedDocumentParser
class RemoteEngineConfig:
def __init__(
self,
engine: str,
api_key: str | None = None,
endpoint: str | None = None,
):
self.engine = engine
self.api_key = api_key
self.endpoint = endpoint
def engine_is_valid(self):
valid = self.engine in ["azureai"] and self.api_key is not None
if self.engine == "azureai":
valid = valid and self.endpoint is not None
return valid
class RemoteDocumentParser(RasterisedDocumentParser):
"""
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
as this is the only service that provides a remote OCR API with text-embedded PDF output.
"""
logging_name = "paperless.parsing.remote"
def get_settings(self) -> RemoteEngineConfig:
"""
Returns the configuration for the remote OCR engine, loaded from Django settings.
"""
return RemoteEngineConfig(
engine=settings.REMOTE_OCR_ENGINE,
api_key=settings.REMOTE_OCR_API_KEY,
endpoint=settings.REMOTE_OCR_ENDPOINT,
)
def supported_mime_types(self):
if self.settings.engine_is_valid():
return {
"application/pdf": ".pdf",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/tiff": ".tiff",
"image/bmp": ".bmp",
"image/gif": ".gif",
"image/webp": ".webp",
}
else:
return {}
def azure_ai_vision_parse(
self,
file: Path,
) -> str | None:
"""
Uses Azure AI Vision to parse the document and return the text content.
It requests a searchable PDF output with embedded text.
The PDF is saved to the archive_path attribute.
Returns the text content extracted from the document.
If the parsing fails, it returns None.
"""
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.documentintelligence.models import AnalyzeOutputOption
from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient(
endpoint=self.settings.endpoint,
credential=AzureKeyCredential(self.settings.api_key),
)
with file.open("rb") as f:
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
poller = client.begin_analyze_document(
model_id="prebuilt-read",
body=analyze_request,
output_content_format=DocumentContentFormat.TEXT,
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
content_type="application/json",
)
poller.wait()
result_id = poller.details["operation_id"]
result = poller.result()
# Download the PDF with embedded text
self.archive_path = Path(self.tempdir) / "archive.pdf"
with self.archive_path.open("wb") as f:
for chunk in client.get_analyze_result_pdf(
model_id="prebuilt-read",
result_id=result_id,
):
f.write(chunk)
return result.content
def parse(self, document_path: Path, mime_type, file_name=None):
if not self.settings.engine_is_valid():
self.log.warning(
"No valid remote parser engine is configured, content will be empty.",
)
self.text = ""
return
elif self.settings.engine == "azureai":
self.text = self.azure_ai_vision_parse(document_path)

View File

@@ -0,0 +1,18 @@
def get_parser(*args, **kwargs):
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(*args, **kwargs)
def get_supported_mime_types():
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(None).supported_mime_types()
def remote_consumer_declaration(sender, **kwargs):
return {
"parser": get_parser,
"weight": 5,
"mime_types": get_supported_mime_types(),
}

View File

Binary file not shown.

View File

@@ -0,0 +1,29 @@
from django.test import TestCase
from django.test import override_settings
from paperless_remote import check_remote_parser_configured
class TestChecks(TestCase):
@override_settings(REMOTE_OCR_ENGINE=None)
def test_no_engine(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 0)
@override_settings(REMOTE_OCR_ENGINE="azureai")
@override_settings(REMOTE_OCR_API_KEY="somekey")
@override_settings(REMOTE_OCR_ENDPOINT=None)
def test_azure_no_endpoint(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 1)
self.assertTrue(
msgs[0].msg.startswith(
"Azure AI remote parser requires endpoint to be configured.",
),
)
@override_settings(REMOTE_OCR_ENGINE="something")
@override_settings(REMOTE_OCR_API_KEY="somekey")
def test_valid_configuration(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 0)

View File

@@ -0,0 +1,101 @@
import uuid
from pathlib import Path
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser
from paperless_remote.signals import get_parser
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
def assertContainsStrings(self, content, strings):
# Asserts that all strings appear in content, in the given order.
indices = []
for s in strings:
if s in content:
indices.append(content.index(s))
else:
self.fail(f"'{s}' is not in '{content}'")
self.assertListEqual(indices, sorted(indices))
@mock.patch("paperless_tesseract.parsers.run_subprocess")
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
# Arrange mock Azure client
mock_client = mock.Mock()
mock_client_cls.return_value = mock_client
# Simulate poller result and its `.details`
mock_poller = mock.Mock()
mock_poller.wait.return_value = None
mock_poller.details = {"operation_id": "fake-op-id"}
mock_client.begin_analyze_document.return_value = mock_poller
mock_poller.result.return_value.content = "This is a test document."
# Return dummy PDF bytes
mock_client.get_analyze_result_pdf.return_value = [
b"%PDF-",
b"1.7 ",
b"FAKEPDF",
]
# Simulate pdftotext by writing dummy text to sidecar file
def fake_run(cmd, *args, **kwargs):
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
f.write("This is a test document.")
mock_subprocess.side_effect = fake_run
with override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="somekey",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
):
parser = get_parser(uuid.uuid4())
parser.parse(
self.SAMPLE_FILES / "simple-digital.pdf",
"application/pdf",
)
self.assertContainsStrings(
parser.text.strip(),
["This is a test document."],
)
@override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="key",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
)
def test_supported_mime_types_valid_config(self):
parser = RemoteDocumentParser(uuid.uuid4())
expected_types = {
"application/pdf": ".pdf",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/tiff": ".tiff",
"image/bmp": ".bmp",
"image/gif": ".gif",
"image/webp": ".webp",
}
self.assertEqual(parser.supported_mime_types(), expected_types)
def test_supported_mime_types_invalid_config(self):
parser = get_parser(uuid.uuid4())
self.assertEqual(parser.supported_mime_types(), {})
@override_settings(
REMOTE_OCR_ENGINE=None,
REMOTE_OCR_API_KEY=None,
REMOTE_OCR_ENDPOINT=None,
)
def test_parse_with_invalid_config(self):
parser = get_parser(uuid.uuid4())
parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
self.assertEqual(parser.text, "")

39
uv.lock generated
View File

@@ -93,6 +93,34 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" },
]
[[package]]
name = "azure-ai-documentintelligence"
version = "1.0.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 },
]
[[package]]
name = "azure-core"
version = "1.33.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 },
]
[[package]]
name = "babel"
version = "2.17.0"
@@ -1383,6 +1411,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" },
]
[[package]]
name = "isodate"
version = "0.7.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 },
]
[[package]]
name = "jinja2"
version = "3.1.6"
@@ -1911,6 +1948,7 @@ name = "paperless-ngx"
version = "2.17.1"
source = { virtual = "." }
dependencies = [
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2044,6 +2082,7 @@ typing = [
[package.metadata]
requires-dist = [
{ name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
{ name = "bleach", specifier = "~=6.2.0" },
{ name = "celery", extras = ["redis"], specifier = "~=5.5.1" },
{ name = "channels", specifier = "~=4.2" },