Compare commits


1 Commit

Author   SHA1         Message                                      Date
shamoon  1b6a9d3816   Add info buttons for core metadata items    2025-08-04 23:45:50 -04:00
37 changed files with 355 additions and 1081 deletions

View File

@@ -15,7 +15,6 @@ env:
DEFAULT_UV_VERSION: "0.8.x"
# This is the default version of Python to use in most steps which aren't specific
DEFAULT_PYTHON_VERSION: "3.11"
NLTK_DATA: "/usr/share/nltk_data"
jobs:
pre-commit:
# We want to run on external PRs, but not on our own internal PRs as they'll be run
@@ -122,11 +121,8 @@ jobs:
- name: List installed Python dependencies
run: |
uv pip list
- name: Install or update NLTK dependencies
run: uv run python -m nltk.downloader punkt punkt_tab snowball_data stopwords -d ${{ env.NLTK_DATA }}
- name: Tests
env:
NLTK_DATA: ${{ env.NLTK_DATA }}
PAPERLESS_CI_TEST: 1
# Enable paperless_mail testing against real server
PAPERLESS_MAIL_TEST_HOST: ${{ secrets.TEST_MAIL_HOST }}

View File

@@ -31,7 +31,7 @@ repos:
rev: v2.4.1
hooks:
- id: codespell
exclude: "(^src-ui/src/locale/)|(^src-ui/pnpm-lock.yaml)|(^src-ui/e2e/)|(^src/paperless_mail/tests/samples/)|(^src/documents/tests/samples/)"
exclude: "(^src-ui/src/locale/)|(^src-ui/pnpm-lock.yaml)|(^src-ui/e2e/)|(^src/paperless_mail/tests/samples/)"
exclude_types:
- pofile
- json

View File

@@ -1776,23 +1776,3 @@ password. All of these options come from their similarly-named [Django settings]
#### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL}
: Defaults to false.
## Remote OCR
#### [`PAPERLESS_REMOTE_OCR_ENGINE=<str>`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE}
: The remote OCR engine to use. Currently only Azure AI is supported as "azureai".
Defaults to None, which disables remote OCR.
#### [`PAPERLESS_REMOTE_OCR_API_KEY=<str>`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY}
: The API key to use for the remote OCR engine.
Defaults to None.
#### [`PAPERLESS_REMOTE_OCR_ENDPOINT=<str>`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT}
: The endpoint to use for the remote OCR engine. This is required for Azure AI.
Defaults to None.
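
For illustration only, a minimal Python sketch that stages these three documented variables in a process environment before Paperless-ngx starts. The values are placeholders, not a real key or endpoint; only the variable names come from the options above.

import os

# Hypothetical placeholder values for the three documented settings.
os.environ["PAPERLESS_REMOTE_OCR_ENGINE"] = "azureai"
os.environ["PAPERLESS_REMOTE_OCR_API_KEY"] = "<your-azure-api-key>"
os.environ["PAPERLESS_REMOTE_OCR_ENDPOINT"] = "https://<resource>.cognitiveservices.azure.com/"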

View File

@@ -25,10 +25,9 @@ physical documents into a searchable online archive so you can keep, well, _less
## Features
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way.
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
- _New!_ Supports remote OCR with Azure AI (opt-in).
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, PowerPoint, and LibreOffice equivalents)[^1] and more.

View File

@@ -844,18 +844,6 @@ how regularly you intend to scan documents and use paperless.
performed the task associated with the document, move it to the
inbox.
## Remote OCR
!!! important
This feature is disabled by default and will always remain strictly "opt-in".
Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to
[Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence).
This is of course a paid service (with a free tier) which requires an Azure account and subscription. Azure AI is not affiliated with
Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing
the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details.
## Architecture
Paperless-ngx consists of the following components:

View File

@@ -15,7 +15,6 @@ classifiers = [
# This will allow testing to not install a webserver, mysql, etc
dependencies = [
"azure-ai-documentintelligence>=1.0.2",
"bleach~=6.2.0",
"celery[redis]~=5.5.1",
"channels~=4.2",
@@ -64,7 +63,7 @@ dependencies = [
"redis[hiredis]~=5.2.1",
"scikit-learn~=1.7.0",
"setproctitle~=1.3.4",
"tika-client~=0.10.0",
"tika-client~=0.9.0",
"tqdm~=4.67.1",
"watchdog~=6.0",
"whitenoise~=6.9",
@@ -205,9 +204,15 @@ lint.per-file-ignores."docker/wait-for-redis.py" = [
"INP001",
"T201",
]
lint.per-file-ignores."src/documents/file_handling.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/documents/management/commands/document_consumer.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/documents/management/commands/document_exporter.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/documents/migrations/1012_fix_archive_files.py" = [
"PTH",
] # TODO Enable & remove
@@ -217,6 +222,9 @@ lint.per-file-ignores."src/documents/models.py" = [
lint.per-file-ignores."src/documents/parsers.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/documents/signals/handlers.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
"RUF001",
]
@@ -234,7 +242,6 @@ testpaths = [
"src/paperless_tesseract/tests/",
"src/paperless_tika/tests",
"src/paperless_text/tests/",
"src/paperless_remote/tests/",
]
addopts = [
"--pythonwarnings=all",

View File

@@ -332,19 +332,19 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">103</context>
<context context-type="linenumber">102</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">103</context>
<context context-type="linenumber">102</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">103</context>
<context context-type="linenumber">102</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">103</context>
<context context-type="linenumber">102</context>
</context-group>
</trans-unit>
<trans-unit id="4930506384627295710" datatype="html">
@@ -545,7 +545,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/settings/settings.component.html</context>
<context context-type="linenumber">362</context>
<context context-type="linenumber">361</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/edit-dialog/correspondent-edit-dialog/correspondent-edit-dialog.component.html</context>
@@ -605,7 +605,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/saved-views/saved-views.component.html</context>
<context context-type="linenumber">74</context>
<context context-type="linenumber">73</context>
</context-group>
</trans-unit>
<trans-unit id="5079885666748292382" datatype="html">
@@ -763,19 +763,19 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">52</context>
<context context-type="linenumber">51</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">52</context>
<context context-type="linenumber">51</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">52</context>
<context context-type="linenumber">51</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">52</context>
<context context-type="linenumber">51</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/saved-views/saved-views.component.html</context>
@@ -1225,19 +1225,19 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">7</context>
<context context-type="linenumber">6</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">7</context>
<context context-type="linenumber">6</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">7</context>
<context context-type="linenumber">6</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">7</context>
<context context-type="linenumber">6</context>
</context-group>
</trans-unit>
<trans-unit id="309314153079578337" datatype="html">
@@ -1432,7 +1432,7 @@
<source>Cancel</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/admin/settings/settings.component.html</context>
<context context-type="linenumber">361</context>
<context context-type="linenumber">362</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/common/confirm-dialog/confirm-dialog.component.ts</context>
@@ -1500,7 +1500,7 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/saved-views/saved-views.component.html</context>
<context context-type="linenumber">73</context>
<context context-type="linenumber">74</context>
</context-group>
</trans-unit>
<trans-unit id="6839066544204061364" datatype="html">
@@ -1598,19 +1598,19 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">4</context>
<context context-type="linenumber">3</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">4</context>
<context context-type="linenumber">3</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">4</context>
<context context-type="linenumber">3</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">4</context>
<context context-type="linenumber">3</context>
</context-group>
</trans-unit>
<trans-unit id="4880728824338713664" datatype="html">
@@ -1696,35 +1696,35 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">21</context>
<context context-type="linenumber">20</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">21</context>
<context context-type="linenumber">20</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">21</context>
<context context-type="linenumber">20</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">21</context>
<context context-type="linenumber">20</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">38</context>
<context context-type="linenumber">37</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">38</context>
<context context-type="linenumber">37</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">38</context>
<context context-type="linenumber">37</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">38</context>
<context context-type="linenumber">37</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/workflows/workflows.component.html</context>
@@ -1816,19 +1816,19 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">44</context>
<context context-type="linenumber">43</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">44</context>
<context context-type="linenumber">43</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">44</context>
<context context-type="linenumber">43</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">44</context>
<context context-type="linenumber">43</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/saved-views/saved-views.component.html</context>
@@ -2121,51 +2121,51 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">10</context>
<context context-type="linenumber">9</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">10</context>
<context context-type="linenumber">9</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">10</context>
<context context-type="linenumber">9</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">10</context>
<context context-type="linenumber">9</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">85</context>
<context context-type="linenumber">84</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">85</context>
<context context-type="linenumber">84</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">85</context>
<context context-type="linenumber">84</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">85</context>
<context context-type="linenumber">84</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">97</context>
<context context-type="linenumber">96</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">97</context>
<context context-type="linenumber">96</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">97</context>
<context context-type="linenumber">96</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">97</context>
<context context-type="linenumber">96</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.ts</context>
@@ -2440,35 +2440,35 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">84</context>
<context context-type="linenumber">83</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">84</context>
<context context-type="linenumber">83</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">84</context>
<context context-type="linenumber">83</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">84</context>
<context context-type="linenumber">83</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">94</context>
<context context-type="linenumber">93</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">94</context>
<context context-type="linenumber">93</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">94</context>
<context context-type="linenumber">93</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">94</context>
<context context-type="linenumber">93</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/workflows/workflows.component.html</context>
@@ -5227,19 +5227,19 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">13</context>
<context context-type="linenumber">12</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">13</context>
<context context-type="linenumber">12</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">13</context>
<context context-type="linenumber">12</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">13</context>
<context context-type="linenumber">12</context>
</context-group>
</trans-unit>
<trans-unit id="4391289919356861627" datatype="html">
@@ -8333,19 +8333,19 @@
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">87</context>
<context context-type="linenumber">86</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">87</context>
<context context-type="linenumber">86</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">87</context>
<context context-type="linenumber">86</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">87</context>
<context context-type="linenumber">86</context>
</context-group>
</trans-unit>
<trans-unit id="651372623796033489" datatype="html">
@@ -8672,76 +8672,76 @@
<source>Filter by:</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">20</context>
<context context-type="linenumber">19</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">20</context>
<context context-type="linenumber">19</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">20</context>
<context context-type="linenumber">19</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">20</context>
<context context-type="linenumber">19</context>
</context-group>
</trans-unit>
<trans-unit id="1383365546483928780" datatype="html">
<source>Matching</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">39</context>
<context context-type="linenumber">38</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">39</context>
<context context-type="linenumber">38</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">39</context>
<context context-type="linenumber">38</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">39</context>
<context context-type="linenumber">38</context>
</context-group>
</trans-unit>
<trans-unit id="1488347670280290838" datatype="html">
<source>Document count</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">40</context>
<context context-type="linenumber">39</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">40</context>
<context context-type="linenumber">39</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">40</context>
<context context-type="linenumber">39</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">40</context>
<context context-type="linenumber">39</context>
</context-group>
</trans-unit>
<trans-unit id="8095412801504464756" datatype="html">
<source>{VAR_PLURAL, plural, =1 {One <x id="INTERPOLATION"/>} other {<x id="INTERPOLATION_1"/> total <x id="INTERPOLATION_2"/>}}</source>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">119</context>
<context context-type="linenumber">118</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">119</context>
<context context-type="linenumber">118</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">119</context>
<context context-type="linenumber">118</context>
</context-group>
<context-group purpose="location">
<context context-type="sourcefile">src/app/components/manage/management-list/management-list.component.html</context>
<context context-type="linenumber">119</context>
<context context-type="linenumber">118</context>
</context-group>
</trans-unit>
<trans-unit id="810888510148304696" datatype="html">

View File

@@ -50,7 +50,7 @@
<div [ngbNavOutlet]="nav" class="border-start border-end border-bottom p-3 mb-3 shadow-sm"></div>
<div class="btn-toolbar" role="toolbar">
<div class="btn-group me-2">
<button type="button" (click)="discardChanges()" class="btn btn-outline-secondary" [disabled]="loading || (isDirty$ | async) === false" i18n>Discard</button>
<button type="button" (click)="discardChanges()" class="btn btn-secondary" [disabled]="loading || (isDirty$ | async) === false" i18n>Discard</button>
</div>
<div class="btn-group">
<button type="submit" class="btn btn-primary" [disabled]="loading || !configForm.valid || (isDirty$ | async) === false" i18n>Save</button>

View File

@@ -358,6 +358,6 @@
<div [ngbNavOutlet]="nav" class="border-start border-end border-bottom p-3 mb-3 shadow-sm"></div>
<button type="button" (click)="reset()" class="btn btn-outline-secondary mb-2" [disabled]="(isDirty$ | async) === false" i18n>Cancel</button>
<button type="submit" class="btn btn-primary ms-2 mb-2" [disabled]="(isDirty$ | async) === false" i18n>Save</button>
<button type="submit" class="btn btn-primary mb-2" [disabled]="(isDirty$ | async) === false" i18n>Save</button>
<button type="button" (click)="reset()" class="btn btn-secondary ms-2 mb-2" [disabled]="(isDirty$ | async) === false" i18n>Cancel</button>
</form>

View File

@@ -164,7 +164,7 @@ describe('ManagementListComponent', () => {
const toastInfoSpy = jest.spyOn(toastService, 'showInfo')
const reloadSpy = jest.spyOn(component, 'reloadData')
const createButton = fixture.debugElement.queryAll(By.css('button'))[4]
const createButton = fixture.debugElement.queryAll(By.css('button'))[3]
createButton.triggerEventHandler('click')
expect(modal).not.toBeUndefined()
@@ -188,7 +188,7 @@ describe('ManagementListComponent', () => {
const toastInfoSpy = jest.spyOn(toastService, 'showInfo')
const reloadSpy = jest.spyOn(component, 'reloadData')
const editButton = fixture.debugElement.queryAll(By.css('button'))[7]
const editButton = fixture.debugElement.queryAll(By.css('button'))[6]
editButton.triggerEventHandler('click')
expect(modal).not.toBeUndefined()
@@ -213,7 +213,7 @@ describe('ManagementListComponent', () => {
const deleteSpy = jest.spyOn(tagService, 'delete')
const reloadSpy = jest.spyOn(component, 'reloadData')
const deleteButton = fixture.debugElement.queryAll(By.css('button'))[8]
const deleteButton = fixture.debugElement.queryAll(By.css('button'))[7]
deleteButton.triggerEventHandler('click')
expect(modal).not.toBeUndefined()
@@ -233,7 +233,7 @@ describe('ManagementListComponent', () => {
it('should support quick filter for objects', () => {
const qfSpy = jest.spyOn(documentListViewService, 'quickFilter')
const filterButton = fixture.debugElement.queryAll(By.css('button'))[9]
const filterButton = fixture.debugElement.queryAll(By.css('button'))[8]
filterButton.triggerEventHandler('click')
expect(qfSpy).toHaveBeenCalledWith([
{ rule_type: FILTER_HAS_TAGS_ALL, value: tags[0].id.toString() },

View File

@@ -70,6 +70,6 @@
}
</ul>
<button type="button" (click)="reset()" class="btn btn-outline-secondary mb-2" [disabled]="(isDirty$ | async) === false" i18n>Cancel</button>
<button type="submit" class="btn btn-primary ms-2 mb-2" [disabled]="(isDirty$ | async) === false" i18n>Save</button>
<button type="submit" class="btn btn-primary mb-2" [disabled]="(isDirty$ | async) === false" i18n>Save</button>
<button type="button" (click)="reset()" class="btn btn-secondary ms-2 mb-2" [disabled]="(isDirty$ | async) === false" i18n>Cancel</button>
</form>

View File

@@ -1,23 +1,16 @@
from __future__ import annotations
import logging
import pickle
from binascii import hexlify
from collections import OrderedDict
from dataclasses import dataclass
from typing import TYPE_CHECKING
from typing import Any
from typing import Final
from django.conf import settings
from django.core.cache import cache
from django.core.cache import caches
from documents.models import Document
if TYPE_CHECKING:
from django.core.cache.backends.base import BaseCache
from documents.classifier import DocumentClassifier
logger = logging.getLogger("paperless.caching")
@@ -46,80 +39,6 @@ CACHE_1_MINUTE: Final[int] = 60
CACHE_5_MINUTES: Final[int] = 5 * CACHE_1_MINUTE
CACHE_50_MINUTES: Final[int] = 50 * CACHE_1_MINUTE
read_cache = caches["read-cache"]
class LRUCache:
def __init__(self, capacity: int = 128):
self._data = OrderedDict()
self.capacity = capacity
def get(self, key, default=None) -> Any | None:
if key in self._data:
self._data.move_to_end(key)
return self._data[key]
return default
def set(self, key, value) -> None:
self._data[key] = value
self._data.move_to_end(key)
while len(self._data) > self.capacity:
self._data.popitem(last=False)
class StoredLRUCache(LRUCache):
"""
LRU cache that can persist its entire contents as a single entry in a backend cache.
Useful for sharing a cache across multiple workers or processes.
Workflow:
1. Load the cache state from the backend using `load()`.
2. Use `get()` and `set()` locally as usual.
3. Persist changes back to the backend using `save()`.
"""
def __init__(
self,
backend_key: str,
capacity: int = 128,
backend: BaseCache = read_cache,
backend_ttl=settings.CACHALOT_TIMEOUT,
):
if backend_key is None:
raise ValueError("backend_key is mandatory")
super().__init__(capacity)
self._backend_key = backend_key
self._backend = backend
self.backend_ttl = backend_ttl
def load(self) -> None:
"""
Load the whole cache content from backend storage.
If no valid cached data exists in the backend, the local cache is cleared.
"""
serialized_data = self._backend.get(self._backend_key)
try:
self._data = (
pickle.loads(serialized_data) if serialized_data else OrderedDict()
)
except pickle.PickleError:
logger.warning(
"Cache exists in backend but could not be read (possibly invalid format)",
)
def save(self) -> None:
"""Save the entire local cache to the backend as a serialized object.
The backend entry will expire after the configured TTL.
"""
self._backend.set(
self._backend_key,
pickle.dumps(self._data),
self.backend_ttl,
)
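
A minimal usage sketch of the load/get/set/save workflow the class docstring describes, assuming a configured Django read-cache backend; the key, capacity, and entries are illustrative only.

from documents.caching import StoredLRUCache

stem_cache = StoredLRUCache("stem_cache_demo", capacity=2)
stem_cache.load()                        # 1. pull any shared state from the backend
if stem_cache.get("amazement") is None:  # 2. use get()/set() locally as usual
    stem_cache.set("amazement", "amaz")
stem_cache.save()                        # 3. persist the whole cache with its TTL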
def get_suggestion_cache_key(document_id: int) -> str:
"""

View File

@@ -16,29 +16,16 @@ if TYPE_CHECKING:
from django.conf import settings
from django.core.cache import cache
from django.core.cache import caches
from documents.caching import CACHE_5_MINUTES
from documents.caching import CACHE_50_MINUTES
from documents.caching import CLASSIFIER_HASH_KEY
from documents.caching import CLASSIFIER_MODIFIED_KEY
from documents.caching import CLASSIFIER_VERSION_KEY
from documents.caching import StoredLRUCache
from documents.models import Document
from documents.models import MatchingModel
logger = logging.getLogger("paperless.classifier")
ADVANCED_TEXT_PROCESSING_ENABLED = (
settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED
)
read_cache = caches["read-cache"]
RE_DIGIT = re.compile(r"\d")
RE_WORD = re.compile(r"\b[\w]+\b") # words that may contain digits
class IncompatibleClassifierVersionError(Exception):
def __init__(self, message: str, *args: object) -> None:
@@ -105,27 +92,14 @@ class DocumentClassifier:
self.last_auto_type_hash: bytes | None = None
self.data_vectorizer = None
self.data_vectorizer_hash = None
self.tags_binarizer = None
self.tags_classifier = None
self.correspondent_classifier = None
self.document_type_classifier = None
self.storage_path_classifier = None
self._stemmer = None
# 10,000 elements roughly use 200 to 500 KB per worker,
# and also in the shared Redis cache,
# Keep this cache small to minimize lookup and I/O latency.
if ADVANCED_TEXT_PROCESSING_ENABLED:
self._stem_cache = StoredLRUCache(
f"stem_cache_v{self.FORMAT_VERSION}",
capacity=10000,
)
self._stop_words = None
def _update_data_vectorizer_hash(self):
self.data_vectorizer_hash = sha256(
pickle.dumps(self.data_vectorizer),
).hexdigest()
self._stemmer = None
self._stop_words = None
def load(self) -> None:
from sklearn.exceptions import InconsistentVersionWarning
@@ -145,7 +119,6 @@ class DocumentClassifier:
self.last_auto_type_hash = pickle.load(f)
self.data_vectorizer = pickle.load(f)
self._update_data_vectorizer_hash()
self.tags_binarizer = pickle.load(f)
self.tags_classifier = pickle.load(f)
@@ -296,7 +269,7 @@ class DocumentClassifier:
Generates the content for documents, one at a time
"""
for doc in docs_queryset:
yield self.preprocess_content(doc.content, shared_cache=False)
yield self.preprocess_content(doc.content)
self.data_vectorizer = CountVectorizer(
analyzer="word",
@@ -374,7 +347,6 @@ class DocumentClassifier:
self.last_doc_change_time = latest_doc_change
self.last_auto_type_hash = hasher.digest()
self._update_data_vectorizer_hash()
# Set the classifier information into the cache
# Caching for 50 minutes, so slightly less than the normal retrain time
@@ -384,15 +356,30 @@ class DocumentClassifier:
return True
def _init_advanced_text_processing(self):
if self._stop_words is None or self._stemmer is None:
def preprocess_content(self, content: str) -> str: # pragma: no cover
"""
Process the contents of a document, distilling it down into
words which are meaningful to the content
"""
# Lower case the document
content = content.lower().strip()
# Reduce spaces
content = re.sub(r"\s+", " ", content)
# Get only the letters
content = re.sub(r"[^\w\s]", " ", content)
# If the NLTK language is supported, do further processing
if settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
# Not really hacky, since it isn't private and is documented, but
# set the search path for NLTK data to the single location it should be in
nltk.data.path = [settings.NLTK_DIR]
try:
# Preload the corpus early, to force the lazy loader to transform
stopwords.ensure_loaded()
@@ -400,100 +387,41 @@ class DocumentClassifier:
# Do some one time setup
# Sometimes, somehow, there are multiple threads loading the corpus
# and it's not thread safe, raising an AttributeError
self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
self._stop_words = frozenset(stopwords.words(settings.NLTK_LANGUAGE))
if self._stemmer is None:
self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
if self._stop_words is None:
self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
# Tokenize
# This splits the content into tokens, roughly words
words: list[str] = word_tokenize(
content,
language=settings.NLTK_LANGUAGE,
)
meaningful_words = []
for word in words:
# Skip stop words
# These are words like "a", "and", "the" which add little meaning
if word in self._stop_words:
continue
# Stem the words
# This reduces the words to their stems.
# "amazement" returns "amaz"
# "amaze" returns "amaz
# "amazed" returns "amaz"
meaningful_words.append(self._stemmer.stem(word))
return " ".join(meaningful_words)
except AttributeError:
logger.debug("Could not initialize NLTK for advanced text processing.")
return False
return True
def stem_and_skip_stop_words(self, words: list[str], *, shared_cache=True):
"""
Reduce a list of words to their stems. Stop words are converted to empty strings.
:param words: the list of words to stem
"""
def _stem_and_skip_stop_word(word: str):
"""
Reduce a given word to its stem. If it's a stop word, return an empty string.
E.g. "amazement", "amaze" and "amazed" all return "amaz".
"""
cached = self._stem_cache.get(word)
if cached is not None:
return cached
elif word in self._stop_words:
return ""
# Assumption: words that contain numbers are never stemmed
elif RE_DIGIT.search(word):
return word
else:
result = self._stemmer.stem(word)
self._stem_cache.set(word, result)
return result
if shared_cache:
self._stem_cache.load()
# Stem the words and skip stop words
result = " ".join(
filter(None, (_stem_and_skip_stop_word(w) for w in words)),
)
if shared_cache:
self._stem_cache.save()
return result
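
For reference, a self-contained sketch of the same per-word rules without the shared cache, assuming the NLTK English corpora are downloaded; the sample words echo the comments above.

import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

RE_DIGIT = re.compile(r"\d")
stemmer = SnowballStemmer("english")
stop_words = frozenset(stopwords.words("english"))

def stem_or_skip(word: str) -> str:
    if word in stop_words:
        return ""              # stop words contribute nothing
    if RE_DIGIT.search(word):
        return word            # words containing digits are never stemmed
    return stemmer.stem(word)  # "amazement" and "amazed" both become "amaz"

print(" ".join(filter(None, map(stem_or_skip, ["the", "amazement", "test0707"]))))
# -> "amaz test0707"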
def preprocess_content(
self,
content: str,
*,
shared_cache=True,
) -> str:
"""
Process the contents of a document, distilling it down into
words which are meaningful to the content.
A stemmer cache is shared across workers with the parameter "shared_cache".
This is unnecessary when training the classifier.
"""
# Lower case the document, reduce space,
# and keep only letters and digits.
content = " ".join(match.group().lower() for match in RE_WORD.finditer(content))
if ADVANCED_TEXT_PROCESSING_ENABLED:
from nltk.tokenize import word_tokenize
if not self._init_advanced_text_processing():
return content
# Tokenize
# This splits the content into tokens, roughly words
words = word_tokenize(content, language=settings.NLTK_LANGUAGE)
# Stem the words and skip stop words
content = self.stem_and_skip_stop_words(words, shared_cache=shared_cache)
return content
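
A quick demonstration of the RE_WORD normalization step in isolation; the sample line is borrowed from the test fixture removed later in this diff.

import re

RE_WORD = re.compile(r"\b[\w]+\b")  # words that may contain digits

text = 'Hey 00, this is "a" test0707 content.'
print(" ".join(m.group().lower() for m in RE_WORD.finditer(text)))
# -> "hey 00 this is a test0707 content"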
def _get_vectorizer_cache_key(self, content: str):
hash = sha256(content.encode())
hash.update(
f"|{self.FORMAT_VERSION}|{settings.NLTK_LANGUAGE}|{settings.NLTK_ENABLED}|{self.data_vectorizer_hash}".encode(),
)
return f"vectorized_content_{hash.hexdigest()}"
def _vectorize(self, content: str):
key = self._get_vectorizer_cache_key(content)
serialized_result = read_cache.get(key)
if serialized_result is None:
result = self.data_vectorizer.transform([self.preprocess_content(content)])
read_cache.set(key, pickle.dumps(result), CACHE_5_MINUTES)
else:
read_cache.touch(key, CACHE_5_MINUTES)
result = pickle.loads(serialized_result)
return result
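
The same read-through pattern as a standalone sketch, assuming a Django-style cache object offering get/set/touch; the function and parameter names here are hypothetical.

import pickle
from hashlib import sha256

def cached_vectorize(cache, vectorizer, preprocess, content: str, ttl: int):
    # Key by content hash; on a hit, refresh the TTL instead of recomputing.
    key = "vectorized_content_" + sha256(content.encode()).hexdigest()
    blob = cache.get(key)
    if blob is None:
        result = vectorizer.transform([preprocess(content)])
        cache.set(key, pickle.dumps(result), ttl)
    else:
        cache.touch(key, ttl)
        result = pickle.loads(blob)
    return result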
def predict_correspondent(self, content: str) -> int | None:
if self.correspondent_classifier:
X = self._vectorize(content)
X = self.data_vectorizer.transform([self.preprocess_content(content)])
correspondent_id = self.correspondent_classifier.predict(X)
if correspondent_id != -1:
return correspondent_id
@@ -504,7 +432,7 @@ class DocumentClassifier:
def predict_document_type(self, content: str) -> int | None:
if self.document_type_classifier:
X = self._vectorize(content)
X = self.data_vectorizer.transform([self.preprocess_content(content)])
document_type_id = self.document_type_classifier.predict(X)
if document_type_id != -1:
return document_type_id
@@ -517,7 +445,7 @@ class DocumentClassifier:
from sklearn.utils.multiclass import type_of_target
if self.tags_classifier:
X = self._vectorize(content)
X = self.data_vectorizer.transform([self.preprocess_content(content)])
y = self.tags_classifier.predict(X)
tags_ids = self.tags_binarizer.inverse_transform(y)[0]
if type_of_target(y).startswith("multilabel"):
@@ -536,7 +464,7 @@ class DocumentClassifier:
def predict_storage_path(self, content: str) -> int | None:
if self.storage_path_classifier:
X = self._vectorize(content)
X = self.data_vectorizer.transform([self.preprocess_content(content)])
storage_path_id = self.storage_path_classifier.predict(X)
if storage_path_id != -1:
return storage_path_id

View File

@@ -1,5 +1,4 @@
import os
from pathlib import Path
from django.conf import settings
@@ -8,15 +7,19 @@ from documents.templating.filepath import validate_filepath_template_and_render
from documents.templating.utils import convert_format_str_to_template_format
def create_source_path_directory(source_path: Path) -> None:
source_path.parent.mkdir(parents=True, exist_ok=True)
def create_source_path_directory(source_path):
os.makedirs(os.path.dirname(source_path), exist_ok=True)
def delete_empty_directories(directory: Path, root: Path) -> None:
if not directory.is_dir():
def delete_empty_directories(directory, root):
if not os.path.isdir(directory):
return
if not directory.is_relative_to(root):
# Go up in the directory hierarchy and try to delete all directories
directory = os.path.normpath(directory)
root = os.path.normpath(root)
if not directory.startswith(root + os.path.sep):
# don't do anything outside our originals folder.
# append os.path.sep so that we avoid these cases:
@@ -24,12 +27,11 @@ def delete_empty_directories(directory: Path, root: Path) -> None:
# root = /home/originals ("/" gets appended and startswith fails)
return
# Go up in the directory hierarchy and try to delete all directories
while directory != root:
if not list(directory.iterdir()):
if not os.listdir(directory):
# it's empty
try:
directory.rmdir()
os.rmdir(directory)
except OSError:
# whatever. empty directories aren't that bad anyway.
return
@@ -38,10 +40,10 @@ def delete_empty_directories(directory: Path, root: Path) -> None:
return
# go one level up
directory = directory.parent
directory = os.path.normpath(os.path.dirname(directory))
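
Why the separator is appended before the prefix check, in a two-line sketch: a sibling directory that merely shares the prefix must not be treated as inside the root.

import os

root = "/home/originals"
print("/home/originals2/foo".startswith(root))                # True  (wrong)
print("/home/originals2/foo".startswith(root + os.path.sep))  # False (right)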
def generate_unique_filename(doc, *, archive_filename=False) -> Path:
def generate_unique_filename(doc, *, archive_filename=False):
"""
Generates a unique filename for doc in settings.ORIGINALS_DIR.
@@ -54,32 +56,21 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
"""
if archive_filename:
old_filename: Path | None = (
Path(doc.archive_filename) if doc.archive_filename else None
)
old_filename = doc.archive_filename
root = settings.ARCHIVE_DIR
else:
old_filename = Path(doc.filename) if doc.filename else None
old_filename = doc.filename
root = settings.ORIGINALS_DIR
# If generating archive filenames, try to make a name that is similar to
# the original filename first.
if archive_filename and doc.filename:
# Generate the full path using the same logic as generate_filename
base_generated = generate_filename(doc, archive_filename=archive_filename)
# Try to create a simple PDF version based on the original filename
# but preserve any directory structure from the template
if str(base_generated.parent) != ".":
# Has directory structure, preserve it
simple_pdf_name = base_generated.parent / (Path(doc.filename).stem + ".pdf")
else:
# No directory structure
simple_pdf_name = Path(Path(doc.filename).stem + ".pdf")
if simple_pdf_name == old_filename or not (root / simple_pdf_name).exists():
return simple_pdf_name
new_filename = os.path.splitext(doc.filename)[0] + ".pdf"
if new_filename == old_filename or not os.path.exists(
os.path.join(root, new_filename),
):
return new_filename
counter = 0
@@ -93,7 +84,7 @@ def generate_unique_filename(doc, *, archive_filename=False) -> Path:
# still the same as before.
return new_filename
if (root / new_filename).exists():
if os.path.exists(os.path.join(root, new_filename)):
counter += 1
else:
return new_filename
@@ -105,8 +96,8 @@ def generate_filename(
counter=0,
append_gpg=True,
archive_filename=False,
) -> Path:
base_path: Path | None = None
):
path = ""
def format_filename(document: Document, template_str: str) -> str | None:
rendered_filename = validate_filepath_template_and_render(
@@ -143,34 +134,17 @@ def generate_filename(
# If we have one, render it
if filename_format is not None:
rendered_path: str | None = format_filename(doc, filename_format)
if rendered_path:
base_path = Path(rendered_path)
path = format_filename(doc, filename_format)
counter_str = f"_{counter:02}" if counter else ""
filetype_str = ".pdf" if archive_filename else doc.file_type
if base_path:
# Split the path into directory and filename parts
directory = base_path.parent
# Use the full name (not just stem) as the base filename
base_filename = base_path.name
# Build the final filename with counter and filetype
final_filename = f"{base_filename}{counter_str}{filetype_str}"
# If we have a directory component, include it
if str(directory) != ".":
full_path = directory / final_filename
else:
full_path = Path(final_filename)
if path:
filename = f"{path}{counter_str}{filetype_str}"
else:
# No template, use document ID
final_filename = f"{doc.pk:07}{counter_str}{filetype_str}"
full_path = Path(final_filename)
filename = f"{doc.pk:07}{counter_str}{filetype_str}"
# Add GPG extension if needed
if append_gpg and doc.storage_type == doc.STORAGE_TYPE_GPG:
full_path = full_path.with_suffix(full_path.suffix + ".gpg")
filename += ".gpg"
return full_path
return filename

View File

@@ -236,7 +236,10 @@ class Command(CryptMixin, BaseCommand):
# now make an archive in the original target, with all files stored
if self.zip_export and temp_dir is not None:
shutil.make_archive(
self.original_target / options["zip_name"],
os.path.join(
self.original_target,
options["zip_name"],
),
format="zip",
root_dir=temp_dir.name,
)
@@ -339,7 +342,7 @@ class Command(CryptMixin, BaseCommand):
)
if self.split_manifest:
manifest_name = base_name.with_name(f"{base_name.stem}-manifest.json")
manifest_name = Path(base_name + "-manifest.json")
if self.use_folder_prefix:
manifest_name = Path("json") / manifest_name
manifest_name = (self.target / manifest_name).resolve()
@@ -413,7 +416,7 @@ class Command(CryptMixin, BaseCommand):
else:
item.unlink()
def generate_base_name(self, document: Document) -> Path:
def generate_base_name(self, document: Document) -> str:
"""
Generates a unique name for the document, one which hasn't already been exported (or will be)
"""
@@ -433,12 +436,12 @@ class Command(CryptMixin, BaseCommand):
break
else:
filename_counter += 1
return Path(base_name)
return base_name
def generate_document_targets(
self,
document: Document,
base_name: Path,
base_name: str,
document_dict: dict,
) -> tuple[Path, Path | None, Path | None]:
"""
@@ -446,25 +449,25 @@ class Command(CryptMixin, BaseCommand):
"""
original_name = base_name
if self.use_folder_prefix:
original_name = Path("originals") / original_name
original_target = (self.target / original_name).resolve()
document_dict[EXPORTER_FILE_NAME] = str(original_name)
original_name = os.path.join("originals", original_name)
original_target = (self.target / Path(original_name)).resolve()
document_dict[EXPORTER_FILE_NAME] = original_name
if not self.no_thumbnail:
thumbnail_name = base_name.parent / (base_name.stem + "-thumbnail.webp")
thumbnail_name = base_name + "-thumbnail.webp"
if self.use_folder_prefix:
thumbnail_name = Path("thumbnails") / thumbnail_name
thumbnail_target = (self.target / thumbnail_name).resolve()
document_dict[EXPORTER_THUMBNAIL_NAME] = str(thumbnail_name)
thumbnail_name = os.path.join("thumbnails", thumbnail_name)
thumbnail_target = (self.target / Path(thumbnail_name)).resolve()
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
else:
thumbnail_target = None
if not self.no_archive and document.has_archive_version:
archive_name = base_name.parent / (base_name.stem + "-archive.pdf")
archive_name = base_name + "-archive.pdf"
if self.use_folder_prefix:
archive_name = Path("archive") / archive_name
archive_target = (self.target / archive_name).resolve()
document_dict[EXPORTER_ARCHIVE_NAME] = str(archive_name)
archive_name = os.path.join("archive", archive_name)
archive_target = (self.target / Path(archive_name)).resolve()
document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
else:
archive_target = None
@@ -569,7 +572,7 @@ class Command(CryptMixin, BaseCommand):
perform_copy = False
if target.exists():
source_stat = source.stat()
source_stat = os.stat(source)
target_stat = target.stat()
if self.compare_checksums and source_checksum:
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
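
For reference, the two naming idioms this hunk swaps between, applied to a hypothetical exporter base name.

from pathlib import Path

base = Path("0000042")  # hypothetical base name
print(base.parent / (base.stem + "-thumbnail.webp"))  # 0000042-thumbnail.webp (pathlib form)
print(str(base) + "-thumbnail.webp")                  # 0000042-thumbnail.webp (string form)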

View File

@@ -63,11 +63,11 @@ class Document:
/ "documents"
/ "originals"
/ f"{self.pk:07}.{self.file_type}.gpg"
)
).as_posix()
@property
def source_file(self):
return self.source_path.open("rb")
return Path(self.source_path).open("rb")
@property
def file_name(self):

View File

@@ -1,8 +1,8 @@
from __future__ import annotations
import logging
import os
import shutil
from pathlib import Path
from typing import TYPE_CHECKING
import httpx
@@ -51,6 +51,8 @@ from documents.permissions import set_permissions_for_object
from documents.templating.workflows import parse_w_workflow_placeholders
if TYPE_CHECKING:
from pathlib import Path
from documents.classifier import DocumentClassifier
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
@@ -327,16 +329,15 @@ def cleanup_document_deletion(sender, instance, **kwargs):
# Find a non-conflicting filename in case a document with the same
# name was moved to trash earlier
counter = 0
old_filename = Path(instance.source_path).name
old_filebase = Path(old_filename).stem
old_fileext = Path(old_filename).suffix
old_filename = os.path.split(instance.source_path)[1]
(old_filebase, old_fileext) = os.path.splitext(old_filename)
while True:
new_file_path = settings.EMPTY_TRASH_DIR / (
old_filebase + (f"_{counter:02}" if counter else "") + old_fileext
)
if new_file_path.exists():
if os.path.exists(new_file_path):
counter += 1
else:
break
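
The suffix scheme used by this loop, isolated as a tiny helper; the names are hypothetical.

from pathlib import Path

def trash_candidate(base: str, ext: str, counter: int) -> Path:
    # counter 0 keeps the original name; 1+ appends a zero-padded suffix
    return Path(base + (f"_{counter:02}" if counter else "") + ext)

print(trash_candidate("doc", ".pdf", 0))  # doc.pdf
print(trash_candidate("doc", ".pdf", 3))  # doc_03.pdf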
@@ -360,26 +361,26 @@ def cleanup_document_deletion(sender, instance, **kwargs):
files += (instance.source_path,)
for filename in files:
if filename and filename.is_file():
if filename and os.path.isfile(filename):
try:
filename.unlink()
os.unlink(filename)
logger.debug(f"Deleted file {filename}.")
except OSError as e:
logger.warning(
f"While deleting document {instance!s}, the file "
f"{filename} could not be deleted: {e}",
)
elif filename and not filename.is_file():
elif filename and not os.path.isfile(filename):
logger.warning(f"Expected {filename} to exist, but it did not")
delete_empty_directories(
Path(instance.source_path).parent,
os.path.dirname(instance.source_path),
root=settings.ORIGINALS_DIR,
)
if instance.has_archive_version:
delete_empty_directories(
Path(instance.archive_path).parent,
os.path.dirname(instance.archive_path),
root=settings.ARCHIVE_DIR,
)
@@ -400,14 +401,14 @@ def update_filename_and_move_files(
if isinstance(instance, CustomFieldInstance):
instance = instance.document
def validate_move(instance, old_path: Path, new_path: Path):
if not old_path.is_file():
def validate_move(instance, old_path, new_path):
if not os.path.isfile(old_path):
# Can't do anything if the old file does not exist anymore.
msg = f"Document {instance!s}: File {old_path} doesn't exist."
logger.fatal(msg)
raise CannotMoveFilesException(msg)
if new_path.is_file():
if os.path.isfile(new_path):
# Can't do anything if the new file already exists. Skip updating file.
msg = f"Document {instance!s}: Cannot rename file since target path {new_path} already exists."
logger.warning(msg)
@@ -435,20 +436,16 @@ def update_filename_and_move_files(
old_filename = instance.filename
old_source_path = instance.source_path
# Need to convert to string to be able to save it to the db
instance.filename = str(generate_unique_filename(instance))
instance.filename = generate_unique_filename(instance)
move_original = old_filename != instance.filename
old_archive_filename = instance.archive_filename
old_archive_path = instance.archive_path
if instance.has_archive_version:
# Need to convert to string to be able to save it to the db
instance.archive_filename = str(
generate_unique_filename(
instance,
archive_filename=True,
),
instance.archive_filename = generate_unique_filename(
instance,
archive_filename=True,
)
move_archive = old_archive_filename != instance.archive_filename
@@ -490,11 +487,11 @@ def update_filename_and_move_files(
# Try to move files to their original location.
try:
if move_original and instance.source_path.is_file():
if move_original and os.path.isfile(instance.source_path):
logger.info("Restoring previous original path")
shutil.move(instance.source_path, old_source_path)
if move_archive and instance.archive_path.is_file():
if move_archive and os.path.isfile(instance.archive_path):
logger.info("Restoring previous archive path")
shutil.move(instance.archive_path, old_archive_path)
@@ -515,15 +512,17 @@ def update_filename_and_move_files(
# finally, remove any empty sub folders. This will do nothing if
# something has failed above.
if not old_source_path.is_file():
if not os.path.isfile(old_source_path):
delete_empty_directories(
Path(old_source_path).parent,
os.path.dirname(old_source_path),
root=settings.ORIGINALS_DIR,
)
if instance.has_archive_version and not old_archive_path.is_file():
if instance.has_archive_version and not os.path.isfile(
old_archive_path,
):
delete_empty_directories(
Path(old_archive_path).parent,
os.path.dirname(old_archive_path),
root=settings.ARCHIVE_DIR,
)
@@ -1220,7 +1219,10 @@ def run_workflows(
)
files = None
if action.webhook.include_document:
with original_file.open("rb") as f:
with open(
original_file,
"rb",
) as f:
files = {
"file": (
filename,

View File

@@ -1,34 +0,0 @@
Sample textual document content.
Include as many characters as possible, to check the classifier's vectorization.
Hey 00, this is "a" test0707 content.
This is an example document — created on 2025-06-25.
Digits: 0123456789
Punctuation: . , ; : ! ? ' " ( ) [ ] { } —
English text: The quick brown fox jumps over the lazy dog.
English stop words: We've been doing it before.
Accented Latin (diacritics): àâäæçéèêëîïôœùûüÿñ
Arabic: لقد قام المترجم بعمل جيد
Greek: Αλφα, Βήτα, Γάμμα, Δέλτα, Ωμέγα
Cyrillic: Привет, как дела? Добро пожаловать!
Chinese (Simplified): 你好,世界!今天的天气很好。
Chinese (Traditional): 歡迎來到世界,今天天氣很好。
Japanese (Kanji, Hiragana, Katakana): 東京へ行きます。カタカナ、ひらがな、漢字。
Korean (Hangul): 안녕하세요. 오늘 날씨 어때요?
Arabic: مرحبًا، كيف حالك؟
Hebrew: שלום, מה שלומך?
Emoji: 😀 🐍 📘 ✅ ©️ 🇺🇳
Symbols: © ® ™ § ¶ † ‡ ∞ µ ∑ ∆ √
Math: ∫₀^∞ x² dx = ∞, π ≈ 3.14159, ∇·E = ρ/ε₀
Currency: 1$ € ¥ £ ₹
Date formats: 25/06/2025, June 25, 2025, 2025年6月25日
Quote in French: « Bonjour, ça va ? »
Quote in German: „Guten Tag! Wie geht's?“
Newline test:
\r\n
\r
Tab\ttest\tspacing
/ = +) ( []) ~ * #192 +33601010101 § ¤
End of document.

View File

@@ -1 +0,0 @@
sample textual document content include as many characters as possible to check the classifier s vectorization hey 00 this is a test0707 content this is an example document created on 2025 06 25 digits 0123456789 punctuation english text the quick brown fox jumps over the lazy dog english stop words we ve been doing it before accented latin diacritics àâäæçéèêëîïôœùûüÿñ arabic لقد قام المترجم بعمل جيد greek αλφα βήτα γάμμα δέλτα ωμέγα cyrillic привет как дела добро пожаловать chinese simplified 你好 世界 今天的天气很好 chinese traditional 歡迎來到世界 今天天氣很好 japanese kanji hiragana katakana 東京へ行きます カタカナ ひらがな 漢字 korean hangul 안녕하세요 오늘 날씨 어때요 arabic مرحب ا كيف حالك hebrew שלום מה שלומך emoji symbols µ math ₀ x² dx π 3 14159 e ρ ε₀ currency 1 date formats 25 06 2025 june 25 2025 2025年6月25日 quote in french bonjour ça va quote in german guten tag wie geht s newline test r n r tab ttest tspacing 192 33601010101 end of document

View File

@@ -1 +0,0 @@
sampl textual document content includ mani charact possibl check classifi vector hey 00 test0707 content exampl document creat 2025 06 25 digit 0123456789 punctuat english text quick brown fox jump lazi dog english stop word accent latin diacrit àâäæçéèêëîïôœùûüÿñ arab لقد قام المترجم بعمل جيد greek αλφα βήτα γάμμα δέλτα ωμέγα cyril привет как дела добро пожаловать chines simplifi 你好 世界 今天的天气很好 chines tradit 歡迎來到世界 今天天氣很好 japanes kanji hiragana katakana 東京へ行きます カタカナ ひらがな 漢字 korean hangul 안녕하세요 오늘 날씨 어때요 arab مرحب ا كيف حالك hebrew שלום מה שלומך emoji symbol µ math ₀ x² dx π 3 14159 e ρ ε₀ currenc 1 date format 25 06 2025 june 25 2025 2025年6月25日 quot french bonjour ça va quot german guten tag wie geht newlin test r n r tab ttest tspace 192 33601010101 end document

View File

@@ -1,45 +0,0 @@
import pickle
from documents.caching import StoredLRUCache
def test_lru_cache_entries():
CACHE_TTL = 1
# LRU cache with a capacity of 2 elements
cache = StoredLRUCache("test_lru_cache_key", 2, backend_ttl=CACHE_TTL)
cache.set(1, 1)
cache.set(2, 2)
assert cache.get(2) == 2
assert cache.get(1) == 1
# The least recently used entry (2) should be evicted
cache.set(3, 3)
assert cache.get(3) == 3
assert not cache.get(2)
assert cache.get(1) == 1
# Save the cache, then load it back and check that the restored state overwrites the in-memory entries
cache.save()
cache.set(4, 4)
assert not cache.get(3)
cache.load()
assert not cache.get(4)
assert cache.get(3) == 3
assert cache.get(1) == 1
def test_stored_lru_cache_key_ttl(mocker):
mock_backend = mocker.Mock()
cache = StoredLRUCache("test_key", backend=mock_backend, backend_ttl=321)
# Simulate storing values
cache.set("x", "X")
cache.set("y", "Y")
cache.save()
# Assert backend.set was called once with the key, the pickled data and the TTL
mock_backend.set.assert_called_once()
key, data, timeout = mock_backend.set.call_args[0]
assert key == "test_key"
assert timeout == 321
assert pickle.loads(data) == {"x": "X", "y": "Y"}
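
A minimal usage sketch of the StoredLRUCache API exercised by these tests; the key, capacity, and TTL values are illustrative, and the backend is assumed to default to the shared Django cache:

from documents.caching import StoredLRUCache

# Two-entry LRU cache whose saved state expires from the backend after 60 s.
cache = StoredLRUCache("demo_cache_key", 2, backend_ttl=60)
cache.set("a", 1)
cache.set("b", 2)
cache.get("a")     # "a" becomes the most recently used entry
cache.set("c", 3)  # evicts "b", the least recently used entry
cache.save()       # pickles the entries into the backend under "demo_cache_key"
# Later, possibly in another worker:
cache.load()       # replaces the in-memory entries with the saved state
assert cache.get("c") == 3
assert not cache.get("b")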

View File

@@ -21,7 +21,7 @@ from documents.models import Tag
from documents.tests.utils import DirectoriesMixin
def dummy_preprocess(content: str, **kwargs):
def dummy_preprocess(content: str):
"""
Simpler, faster pre-processing for testing purposes
"""
@@ -223,47 +223,24 @@ class TestClassifier(DirectoriesMixin, TestCase):
self.generate_test_data()
self.classifier.train()
with (
mock.patch.object(
self.classifier.data_vectorizer,
"transform",
wraps=self.classifier.data_vectorizer.transform,
) as mock_transform,
mock.patch.object(
self.classifier,
"preprocess_content",
wraps=self.classifier.preprocess_content,
) as mock_preprocess_content,
):
self.assertEqual(
self.classifier.predict_correspondent(self.doc1.content),
self.c1.pk,
)
self.assertEqual(
self.classifier.predict_correspondent(self.doc2.content),
None,
)
self.assertListEqual(
self.classifier.predict_tags(self.doc1.content),
[self.t1.pk],
)
self.assertListEqual(
self.classifier.predict_tags(self.doc2.content),
[self.t1.pk, self.t3.pk],
)
self.assertEqual(
self.classifier.predict_document_type(self.doc1.content),
self.dt.pk,
)
self.assertEqual(
self.classifier.predict_document_type(self.doc2.content),
None,
)
# Check that the classifier's vectorized content and preprocessed text have been cached
# It should be called once per document (doc1 and doc2)
self.assertEqual(mock_preprocess_content.call_count, 2)
self.assertEqual(mock_transform.call_count, 2)
self.assertEqual(
self.classifier.predict_correspondent(self.doc1.content),
self.c1.pk,
)
self.assertEqual(self.classifier.predict_correspondent(self.doc2.content), None)
self.assertListEqual(
self.classifier.predict_tags(self.doc1.content),
[self.t1.pk],
)
self.assertListEqual(
self.classifier.predict_tags(self.doc2.content),
[self.t1.pk, self.t3.pk],
)
self.assertEqual(
self.classifier.predict_document_type(self.doc1.content),
self.dt.pk,
)
self.assertEqual(self.classifier.predict_document_type(self.doc2.content), None)
def test_no_retrain_if_no_change(self):
"""
@@ -717,67 +694,3 @@ class TestClassifier(DirectoriesMixin, TestCase):
mock_load.side_effect = Exception()
with self.assertRaises(Exception):
load_classifier(raise_exception=True)
def test_preprocess_content():
"""
GIVEN:
- Advanced text processing is enabled (default)
WHEN:
- Classifier preprocesses a document's content
THEN:
- Processed content matches the expected output (stemmed words)
"""
with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
content = f.read()
with (Path(__file__).parent / "samples" / "preprocessed_content_advanced.txt").open(
"r",
) as f:
expected_preprocess_content = f.read().rstrip()
classifier = DocumentClassifier()
result = classifier.preprocess_content(content)
assert result == expected_preprocess_content
def test_preprocess_content_nltk_disabled():
"""
GIVEN:
- Advanced text processing is disabled
WHEN:
- Classifier preprocesses a document's content
THEN:
- Processed content matches the expected output (unstemmed words)
"""
with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
content = f.read()
with (Path(__file__).parent / "samples" / "preprocessed_content.txt").open(
"r",
) as f:
expected_preprocess_content = f.read().rstrip()
classifier = DocumentClassifier()
with mock.patch("documents.classifier.ADVANCED_TEXT_PROCESSING_ENABLED", new=False):
result = classifier.preprocess_content(content)
assert result == expected_preprocess_content
def test_preprocess_content_nltk_load_fail(mocker):
"""
GIVEN:
- NLTK stop words fail to load
WHEN:
- Classifier preprocesses a document's content
THEN:
- Processed content matches the expected output (unstemmed words)
"""
_module = mocker.MagicMock(name="nltk_corpus_mock")
_module.stopwords.words.side_effect = AttributeError()
mocker.patch.dict("sys.modules", {"nltk.corpus": _module})
classifier = DocumentClassifier()
with (Path(__file__).parent / "samples" / "content.txt").open("r") as f:
content = f.read()
with (Path(__file__).parent / "samples" / "preprocessed_content.txt").open(
"r",
) as f:
expected_preprocess_content = f.read().rstrip()
result = classifier.preprocess_content(content)
assert result == expected_preprocess_content
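
To make the expected fixtures in these tests concrete, here is roughly what the two preprocessing modes yield for one sentence of the sample document, per the preprocessed sample files above (a sketch; exact tokenization follows the classifier's implementation):

from documents.classifier import DocumentClassifier

classifier = DocumentClassifier()
text = "The quick brown fox jumps over the lazy dog."
# Advanced processing disabled: lowercase and strip punctuation only, giving
#   "the quick brown fox jumps over the lazy dog"
# Advanced processing enabled: additionally drop stop words and stem, giving
#   "quick brown fox jump lazi dog"
print(classifier.preprocess_content(text))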

View File

@@ -41,9 +41,11 @@ class TestDocument(TestCase):
Path(file_path).touch()
Path(thumb_path).touch()
with mock.patch("documents.signals.handlers.Path.unlink") as mock_unlink:
with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
document.delete()
empty_trash([document.pk])
mock_unlink.assert_any_call(file_path)
mock_unlink.assert_any_call(thumb_path)
self.assertEqual(mock_unlink.call_count, 2)
def test_document_soft_delete(self):
@@ -61,7 +63,7 @@ class TestDocument(TestCase):
Path(file_path).touch()
Path(thumb_path).touch()
with mock.patch("documents.signals.handlers.Path.unlink") as mock_unlink:
with mock.patch("documents.signals.handlers.os.unlink") as mock_unlink:
document.delete()
self.assertEqual(mock_unlink.call_count, 0)

View File

@@ -34,12 +34,12 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
document.save()
self.assertEqual(generate_filename(document), Path(f"{document.pk:07d}.pdf"))
self.assertEqual(generate_filename(document), f"{document.pk:07d}.pdf")
document.storage_type = Document.STORAGE_TYPE_GPG
self.assertEqual(
generate_filename(document),
Path(f"{document.pk:07d}.pdf.gpg"),
f"{document.pk:07d}.pdf.gpg",
)
@override_settings(FILENAME_FORMAT="{correspondent}/{correspondent}")
@@ -58,12 +58,12 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
document.filename = generate_filename(document)
# Ensure that filename is properly generated
self.assertEqual(document.filename, Path("none/none.pdf"))
self.assertEqual(document.filename, "none/none.pdf")
# Enable encryption and check again
document.storage_type = Document.STORAGE_TYPE_GPG
document.filename = generate_filename(document)
self.assertEqual(document.filename, Path("none/none.pdf.gpg"))
self.assertEqual(document.filename, "none/none.pdf.gpg")
document.save()
@@ -96,7 +96,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
self.assertEqual(document.filename, Path("none/none.pdf"))
self.assertEqual(document.filename, "none/none.pdf")
create_source_path_directory(document.source_path)
document.source_path.touch()
@@ -137,7 +137,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
self.assertEqual(document.filename, Path("none/none.pdf"))
self.assertEqual(document.filename, "none/none.pdf")
create_source_path_directory(document.source_path)
Path(document.source_path).touch()
@@ -247,7 +247,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
self.assertEqual(document.filename, Path("none/none.pdf"))
self.assertEqual(document.filename, "none/none.pdf")
create_source_path_directory(document.source_path)
@@ -269,11 +269,11 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
dt = DocumentType.objects.create(name="my_doc_type")
d = Document.objects.create(title="the_doc", mime_type="application/pdf")
self.assertEqual(generate_filename(d), Path("none - the_doc.pdf"))
self.assertEqual(generate_filename(d), "none - the_doc.pdf")
d.document_type = dt
self.assertEqual(generate_filename(d), Path("my_doc_type - the_doc.pdf"))
self.assertEqual(generate_filename(d), "my_doc_type - the_doc.pdf")
@override_settings(FILENAME_FORMAT="{asn} - {title}")
def test_asn(self):
@@ -289,8 +289,8 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
archive_serial_number=None,
checksum="B",
)
self.assertEqual(generate_filename(d1), Path("652 - the_doc.pdf"))
self.assertEqual(generate_filename(d2), Path("none - the_doc.pdf"))
self.assertEqual(generate_filename(d1), "652 - the_doc.pdf")
self.assertEqual(generate_filename(d2), "none - the_doc.pdf")
@override_settings(FILENAME_FORMAT="{title} {tag_list}")
def test_tag_list(self):
@@ -298,7 +298,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
doc.tags.create(name="tag2")
doc.tags.create(name="tag1")
self.assertEqual(generate_filename(doc), Path("doc1 tag1,tag2.pdf"))
self.assertEqual(generate_filename(doc), "doc1 tag1,tag2.pdf")
doc = Document.objects.create(
title="doc2",
@@ -306,7 +306,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
mime_type="application/pdf",
)
self.assertEqual(generate_filename(doc), Path("doc2.pdf"))
self.assertEqual(generate_filename(doc), "doc2.pdf")
@override_settings(FILENAME_FORMAT="//etc/something/{title}")
def test_filename_relative(self):
@@ -330,11 +330,11 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
created=d1,
)
self.assertEqual(generate_filename(doc1), Path("2020-03-06.pdf"))
self.assertEqual(generate_filename(doc1), "2020-03-06.pdf")
doc1.created = datetime.date(2020, 11, 16)
self.assertEqual(generate_filename(doc1), Path("2020-11-16.pdf"))
self.assertEqual(generate_filename(doc1), "2020-11-16.pdf")
@override_settings(
FILENAME_FORMAT="{added_year}-{added_month}-{added_day}",
@@ -347,11 +347,11 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
added=d1,
)
self.assertEqual(generate_filename(doc1), Path("232-01-09.pdf"))
self.assertEqual(generate_filename(doc1), "232-01-09.pdf")
doc1.added = timezone.make_aware(datetime.datetime(2020, 11, 16, 1, 1, 1))
self.assertEqual(generate_filename(doc1), Path("2020-11-16.pdf"))
self.assertEqual(generate_filename(doc1), "2020-11-16.pdf")
@override_settings(
FILENAME_FORMAT="{correspondent}/{correspondent}/{correspondent}",
@@ -389,11 +389,11 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), Path("0000001.pdf"))
self.assertEqual(generate_filename(document), "0000001.pdf")
document.pk = 13579
self.assertEqual(generate_filename(document), Path("0013579.pdf"))
self.assertEqual(generate_filename(document), "0013579.pdf")
@override_settings(FILENAME_FORMAT=None)
def test_format_none(self):
@@ -402,7 +402,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), Path("0000001.pdf"))
self.assertEqual(generate_filename(document), "0000001.pdf")
def test_try_delete_empty_directories(self):
# Create our working directory
@@ -428,7 +428,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), Path("0000001.pdf"))
self.assertEqual(generate_filename(document), "0000001.pdf")
@override_settings(FILENAME_FORMAT="{created__year}")
def test_invalid_format_key(self):
@@ -437,7 +437,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), Path("0000001.pdf"))
self.assertEqual(generate_filename(document), "0000001.pdf")
@override_settings(FILENAME_FORMAT="{title}")
def test_duplicates(self):
@@ -564,7 +564,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
value_select="abc123",
)
self.assertEqual(generate_filename(doc), Path("document_apple.pdf"))
self.assertEqual(generate_filename(doc), "document_apple.pdf")
# handler should not have been called
self.assertEqual(m.call_count, 0)
@@ -576,7 +576,7 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
],
}
cf.save()
self.assertEqual(generate_filename(doc), Path("document_aubergine.pdf"))
self.assertEqual(generate_filename(doc), "document_aubergine.pdf")
# handler should have been called
self.assertEqual(m.call_count, 1)
@@ -897,7 +897,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
pk=1,
checksum="1",
)
self.assertEqual(generate_filename(doc), Path("This. is the title.pdf"))
self.assertEqual(generate_filename(doc), "This. is the title.pdf")
doc = Document.objects.create(
title="my\\invalid/../title:yay",
@@ -905,7 +905,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
pk=2,
checksum="2",
)
self.assertEqual(generate_filename(doc), Path("my-invalid-..-title-yay.pdf"))
self.assertEqual(generate_filename(doc), "my-invalid-..-title-yay.pdf")
@override_settings(FILENAME_FORMAT="{created}")
def test_date(self):
@@ -916,7 +916,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
pk=2,
checksum="2",
)
self.assertEqual(generate_filename(doc), Path("2020-05-21.pdf"))
self.assertEqual(generate_filename(doc), "2020-05-21.pdf")
def test_dynamic_path(self):
"""
@@ -935,7 +935,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
checksum="2",
storage_path=StoragePath.objects.create(path="TestFolder/{{created}}"),
)
self.assertEqual(generate_filename(doc), Path("TestFolder/2020-06-25.pdf"))
self.assertEqual(generate_filename(doc), "TestFolder/2020-06-25.pdf")
def test_dynamic_path_with_none(self):
"""
@@ -956,7 +956,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
checksum="2",
storage_path=StoragePath.objects.create(path="{{asn}} - {{created}}"),
)
self.assertEqual(generate_filename(doc), Path("none - 2020-06-25.pdf"))
self.assertEqual(generate_filename(doc), "none - 2020-06-25.pdf")
@override_settings(
FILENAME_FORMAT_REMOVE_NONE=True,
@@ -984,7 +984,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
checksum="2",
storage_path=sp,
)
self.assertEqual(generate_filename(doc), Path("TestFolder/2020-06-25.pdf"))
self.assertEqual(generate_filename(doc), "TestFolder/2020-06-25.pdf")
# Special case, undefined variable, then defined at the start of the template
# This could lead to an absolute path after we remove the leading -none-, but leave the leading /
@@ -993,7 +993,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
"{{ owner_username }}/{{ created_year }}/{{ correspondent }}/{{ title }}"
)
sp.save()
self.assertEqual(generate_filename(doc), Path("2020/does not matter.pdf"))
self.assertEqual(generate_filename(doc), "2020/does not matter.pdf")
def test_multiple_doc_paths(self):
"""
@@ -1028,14 +1028,8 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
),
)
self.assertEqual(
generate_filename(doc_a),
Path("ThisIsAFolder/4/2020-06-25.pdf"),
)
self.assertEqual(
generate_filename(doc_b),
Path("SomeImportantNone/2020-07-25.pdf"),
)
self.assertEqual(generate_filename(doc_a), "ThisIsAFolder/4/2020-06-25.pdf")
self.assertEqual(generate_filename(doc_b), "SomeImportantNone/2020-07-25.pdf")
@override_settings(
FILENAME_FORMAT=None,
@@ -1070,11 +1064,8 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
),
)
self.assertEqual(generate_filename(doc_a), Path("0000002.pdf"))
self.assertEqual(
generate_filename(doc_b),
Path("SomeImportantNone/2020-07-25.pdf"),
)
self.assertEqual(generate_filename(doc_a), "0000002.pdf")
self.assertEqual(generate_filename(doc_b), "SomeImportantNone/2020-07-25.pdf")
@override_settings(
FILENAME_FORMAT="{created_year_short}/{created_month_name_short}/{created_month_name}/{title}",
@@ -1087,7 +1078,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
pk=2,
checksum="2",
)
self.assertEqual(generate_filename(doc), Path("89/Dec/December/The Title.pdf"))
self.assertEqual(generate_filename(doc), "89/Dec/December/The Title.pdf")
@override_settings(
FILENAME_FORMAT="{added_year_short}/{added_month_name}/{added_month_name_short}/{title}",
@@ -1100,7 +1091,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
pk=2,
checksum="2",
)
self.assertEqual(generate_filename(doc), Path("84/August/Aug/The Title.pdf"))
self.assertEqual(generate_filename(doc), "84/August/Aug/The Title.pdf")
@override_settings(
FILENAME_FORMAT="{owner_username}/{title}",
@@ -1133,8 +1124,8 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
checksum="3",
)
self.assertEqual(generate_filename(owned_doc), Path("user1/The Title.pdf"))
self.assertEqual(generate_filename(no_owner_doc), Path("none/does matter.pdf"))
self.assertEqual(generate_filename(owned_doc), "user1/The Title.pdf")
self.assertEqual(generate_filename(no_owner_doc), "none/does matter.pdf")
@override_settings(
FILENAME_FORMAT="{original_name}",
@@ -1180,20 +1171,17 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
original_filename="logs.txt",
)
self.assertEqual(generate_filename(doc_with_original), Path("someepdf.pdf"))
self.assertEqual(generate_filename(doc_with_original), "someepdf.pdf")
self.assertEqual(
generate_filename(tricky_with_original),
Path("some pdf with spaces and stuff.pdf"),
"some pdf with spaces and stuff.pdf",
)
self.assertEqual(generate_filename(no_original), Path("none.pdf"))
self.assertEqual(generate_filename(no_original), "none.pdf")
self.assertEqual(generate_filename(text_doc), Path("logs.txt"))
self.assertEqual(
generate_filename(text_doc, archive_filename=True),
Path("logs.pdf"),
)
self.assertEqual(generate_filename(text_doc), "logs.txt")
self.assertEqual(generate_filename(text_doc, archive_filename=True), "logs.pdf")
@override_settings(
FILENAME_FORMAT="XX{correspondent}/{title}",
@@ -1218,7 +1206,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
# Ensure that filename is properly generated
document.filename = generate_filename(document)
self.assertEqual(document.filename, Path("XX/doc1.pdf"))
self.assertEqual(document.filename, "XX/doc1.pdf")
def test_complex_template_strings(self):
"""
@@ -1256,19 +1244,19 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
self.assertEqual(
generate_filename(doc_a),
Path("somepath/some where/2020-06-25/Does Matter.pdf"),
"somepath/some where/2020-06-25/Does Matter.pdf",
)
doc_a.checksum = "5"
self.assertEqual(
generate_filename(doc_a),
Path("somepath/2024-10-01/Does Matter.pdf"),
"somepath/2024-10-01/Does Matter.pdf",
)
sp.path = "{{ document.title|lower }}{{ document.archive_serial_number - 2 }}"
sp.save()
self.assertEqual(generate_filename(doc_a), Path("does matter23.pdf"))
self.assertEqual(generate_filename(doc_a), "does matter23.pdf")
sp.path = """
somepath/
@@ -1287,13 +1275,13 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
sp.save()
self.assertEqual(
generate_filename(doc_a),
Path("somepath/asn-000-200/Does Matter/Does Matter.pdf"),
"somepath/asn-000-200/Does Matter/Does Matter.pdf",
)
doc_a.archive_serial_number = 301
doc_a.save()
self.assertEqual(
generate_filename(doc_a),
Path("somepath/asn-201-400/asn-3xx/Does Matter.pdf"),
"somepath/asn-201-400/asn-3xx/Does Matter.pdf",
)
@override_settings(
@@ -1322,7 +1310,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
with self.assertLogs(level=logging.WARNING) as capture:
self.assertEqual(
generate_filename(doc_a),
Path("0000002.pdf"),
"0000002.pdf",
)
self.assertEqual(len(capture.output), 1)
@@ -1357,7 +1345,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
with self.assertLogs(level=logging.WARNING) as capture:
self.assertEqual(
generate_filename(doc_a),
Path("0000002.pdf"),
"0000002.pdf",
)
self.assertEqual(len(capture.output), 1)
@@ -1425,7 +1413,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
):
self.assertEqual(
generate_filename(doc_a),
Path("invoices/1234.pdf"),
"invoices/1234.pdf",
)
with override_settings(
@@ -1439,7 +1427,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
):
self.assertEqual(
generate_filename(doc_a),
Path("Some Title_ChoiceOne.pdf"),
"Some Title_ChoiceOne.pdf",
)
# Check that None values are handled gracefully
@@ -1448,7 +1436,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
self.assertEqual(
generate_filename(doc_a),
Path("Some Title_Default Value.pdf"),
"Some Title_Default Value.pdf",
)
cf.name = "Invoice Number"
@@ -1461,7 +1449,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
):
self.assertEqual(
generate_filename(doc_a),
Path("invoices/4567.pdf"),
"invoices/4567.pdf",
)
with override_settings(
@@ -1469,7 +1457,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
):
self.assertEqual(
generate_filename(doc_a),
Path("invoices/0.pdf"),
"invoices/0.pdf",
)
def test_datetime_filter(self):
@@ -1508,7 +1496,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
):
self.assertEqual(
generate_filename(doc_a),
Path("2020/Some Title.pdf"),
"2020/Some Title.pdf",
)
with override_settings(
@@ -1516,7 +1504,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
):
self.assertEqual(
generate_filename(doc_a),
Path("2020-06-25/Some Title.pdf"),
"2020-06-25/Some Title.pdf",
)
with override_settings(
@@ -1524,7 +1512,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
):
self.assertEqual(
generate_filename(doc_a),
Path("2024-10-01/Some Title.pdf"),
"2024-10-01/Some Title.pdf",
)
def test_slugify_filter(self):
@@ -1551,7 +1539,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
):
self.assertEqual(
generate_filename(doc),
Path("some-title-with-special-characters.pdf"),
"some-title-with-special-characters.pdf",
)
# Test with correspondent name containing spaces and special chars
@@ -1565,7 +1553,7 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
):
self.assertEqual(
generate_filename(doc),
Path("johns-office-workplace/some-title-with-special-characters.pdf"),
"johns-office-workplace/some-title-with-special-characters.pdf",
)
# Test with custom fields
@@ -1584,5 +1572,5 @@ class TestFilenameGeneration(DirectoriesMixin, TestCase):
):
self.assertEqual(
generate_filename(doc),
Path("brussels-belgium/some-title-with-special-characters.pdf"),
"brussels-belgium/some-title-with-special-characters.pdf",
)

View File

@@ -209,7 +209,7 @@ class TestExportImport(
4,
)
self.assertIsFile(self.target / "manifest.json")
self.assertIsFile((self.target / "manifest.json").as_posix())
self.assertEqual(
self._get_document_from_manifest(manifest, self.d1.id)["fields"]["title"],
@@ -235,7 +235,9 @@ class TestExportImport(
).as_posix()
self.assertIsFile(fname)
self.assertIsFile(
self.target / element[document_exporter.EXPORTER_THUMBNAIL_NAME],
(
self.target / element[document_exporter.EXPORTER_THUMBNAIL_NAME]
).as_posix(),
)
with Path(fname).open("rb") as f:
@@ -250,7 +252,7 @@ class TestExportImport(
if document_exporter.EXPORTER_ARCHIVE_NAME in element:
fname = (
self.target / element[document_exporter.EXPORTER_ARCHIVE_NAME]
)
).as_posix()
self.assertIsFile(fname)
with Path(fname).open("rb") as f:
@@ -310,7 +312,7 @@ class TestExportImport(
)
self._do_export()
self.assertIsFile(self.target / "manifest.json")
self.assertIsFile((self.target / "manifest.json").as_posix())
st_mtime_1 = (self.target / "manifest.json").stat().st_mtime
@@ -320,7 +322,7 @@ class TestExportImport(
self._do_export()
m.assert_not_called()
self.assertIsFile(self.target / "manifest.json")
self.assertIsFile((self.target / "manifest.json").as_posix())
st_mtime_2 = (self.target / "manifest.json").stat().st_mtime
Path(self.d1.source_path).touch()
@@ -332,7 +334,7 @@ class TestExportImport(
self.assertEqual(m.call_count, 1)
st_mtime_3 = (self.target / "manifest.json").stat().st_mtime
self.assertIsFile(self.target / "manifest.json")
self.assertIsFile((self.target / "manifest.json").as_posix())
self.assertNotEqual(st_mtime_1, st_mtime_2)
self.assertNotEqual(st_mtime_2, st_mtime_3)
@@ -350,7 +352,7 @@ class TestExportImport(
self._do_export()
self.assertIsFile(self.target / "manifest.json")
self.assertIsFile((self.target / "manifest.json").as_posix())
with mock.patch(
"documents.management.commands.document_exporter.copy_file_with_basic_stats",
@@ -358,7 +360,7 @@ class TestExportImport(
self._do_export()
m.assert_not_called()
self.assertIsFile(self.target / "manifest.json")
self.assertIsFile((self.target / "manifest.json").as_posix())
self.d2.checksum = "asdfasdgf3"
self.d2.save()
@@ -369,7 +371,7 @@ class TestExportImport(
self._do_export(compare_checksums=True)
self.assertEqual(m.call_count, 1)
self.assertIsFile(self.target / "manifest.json")
self.assertIsFile((self.target / "manifest.json").as_posix())
def test_update_export_deleted_document(self):
shutil.rmtree(Path(self.dirs.media_dir) / "documents")
@@ -383,7 +385,7 @@ class TestExportImport(
self.assertTrue(len(manifest), 7)
doc_from_manifest = self._get_document_from_manifest(manifest, self.d3.id)
self.assertIsFile(
str(self.target / doc_from_manifest[EXPORTER_FILE_NAME]),
(self.target / doc_from_manifest[EXPORTER_FILE_NAME]).as_posix(),
)
self.d3.delete()
@@ -395,12 +397,12 @@ class TestExportImport(
self.d3.id,
)
self.assertIsFile(
self.target / doc_from_manifest[EXPORTER_FILE_NAME],
(self.target / doc_from_manifest[EXPORTER_FILE_NAME]).as_posix(),
)
manifest = self._do_export(delete=True)
self.assertIsNotFile(
self.target / doc_from_manifest[EXPORTER_FILE_NAME],
(self.target / doc_from_manifest[EXPORTER_FILE_NAME]).as_posix(),
)
self.assertTrue(len(manifest), 6)
@@ -414,20 +416,20 @@ class TestExportImport(
)
self._do_export(use_filename_format=True)
self.assertIsFile(self.target / "wow1" / "c.pdf")
self.assertIsFile((self.target / "wow1" / "c.pdf").as_posix())
self.assertIsFile(self.target / "manifest.json")
self.assertIsFile((self.target / "manifest.json").as_posix())
self.d1.title = "new_title"
self.d1.save()
self._do_export(use_filename_format=True, delete=True)
self.assertIsNotFile(self.target / "wow1" / "c.pdf")
self.assertIsNotDir(self.target / "wow1")
self.assertIsFile(self.target / "new_title" / "c.pdf")
self.assertIsFile(self.target / "manifest.json")
self.assertIsFile(self.target / "wow2" / "none.pdf")
self.assertIsNotFile((self.target / "wow1" / "c.pdf").as_posix())
self.assertIsNotDir((self.target / "wow1").as_posix())
self.assertIsFile((self.target / "new_title" / "c.pdf").as_posix())
self.assertIsFile((self.target / "manifest.json").as_posix())
self.assertIsFile((self.target / "wow2" / "none.pdf").as_posix())
self.assertIsFile(
self.target / "wow2" / "none_01.pdf",
(self.target / "wow2" / "none_01.pdf").as_posix(),
)
def test_export_missing_files(self):

View File

@@ -20,7 +20,7 @@ def source_path_before(self):
if self.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg"
return Path(settings.ORIGINALS_DIR) / fname
return (Path(settings.ORIGINALS_DIR) / fname).as_posix()
def file_type_after(self):
@@ -35,7 +35,7 @@ def source_path_after(doc):
if doc.storage_type == STORAGE_TYPE_GPG:
fname += ".gpg" # pragma: no cover
return Path(settings.ORIGINALS_DIR) / fname
return (Path(settings.ORIGINALS_DIR) / fname).as_posix()
@override_settings(PASSPHRASE="test")

View File

@@ -324,7 +324,6 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"paperless_remote.apps.PaperlessRemoteParserConfig",
"django.contrib.admin",
"rest_framework",
"rest_framework.authtoken",
@@ -1422,11 +1421,3 @@ OUTLOOK_OAUTH_ENABLED = bool(
and OUTLOOK_OAUTH_CLIENT_ID
and OUTLOOK_OAUTH_CLIENT_SECRET,
)
###############################################################################
# Remote Parser #
###############################################################################
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")
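
For reference, a minimal environment configuration that would populate these settings; the API key is a placeholder and the endpoint follows the format used in the tests below:

PAPERLESS_REMOTE_OCR_ENGINE=azureai
PAPERLESS_REMOTE_OCR_API_KEY=<your-azure-api-key>
PAPERLESS_REMOTE_OCR_ENDPOINT=https://endpoint.cognitiveservices.azure.com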

View File

@@ -1,4 +0,0 @@
# This is here so that Django finds the checks.
from paperless_remote.checks import check_remote_parser_configured
__all__ = ["check_remote_parser_configured"]

View File

@@ -1,14 +0,0 @@
from django.apps import AppConfig
from paperless_remote.signals import remote_consumer_declaration
class PaperlessRemoteParserConfig(AppConfig):
name = "paperless_remote"
def ready(self):
from documents.signals import document_consumer_declaration
document_consumer_declaration.connect(remote_consumer_declaration)
AppConfig.ready(self)

View File

@@ -1,15 +0,0 @@
from django.conf import settings
from django.core.checks import Error
from django.core.checks import register
@register()
def check_remote_parser_configured(app_configs, **kwargs):
if settings.REMOTE_OCR_ENGINE == "azureai" and not settings.REMOTE_OCR_ENDPOINT:
return [
Error(
"Azure AI remote parser requires endpoint to be configured.",
),
]
return []
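
With REMOTE_OCR_ENGINE set to "azureai" but no endpoint configured, Django's system checks (python manage.py check) would surface this as roughly:

ERRORS:
?: Azure AI remote parser requires endpoint to be configured.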

View File

@@ -1,113 +0,0 @@
from pathlib import Path
from django.conf import settings
from paperless_tesseract.parsers import RasterisedDocumentParser
class RemoteEngineConfig:
def __init__(
self,
engine: str,
api_key: str | None = None,
endpoint: str | None = None,
):
self.engine = engine
self.api_key = api_key
self.endpoint = endpoint
def engine_is_valid(self):
valid = self.engine in ["azureai"] and self.api_key is not None
if self.engine == "azureai":
valid = valid and self.endpoint is not None
return valid
class RemoteDocumentParser(RasterisedDocumentParser):
"""
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
as this is the only service that provides a remote OCR API with text-embedded PDF output.
"""
logging_name = "paperless.parsing.remote"
def get_settings(self) -> RemoteEngineConfig:
"""
Returns the configuration for the remote OCR engine, loaded from Django settings.
"""
return RemoteEngineConfig(
engine=settings.REMOTE_OCR_ENGINE,
api_key=settings.REMOTE_OCR_API_KEY,
endpoint=settings.REMOTE_OCR_ENDPOINT,
)
def supported_mime_types(self):
if self.settings.engine_is_valid():
return [
"application/pdf",
"image/png",
"image/jpeg",
"image/tiff",
"image/bmp",
"image/gif",
"image/webp",
]
else:
return []
def azure_ai_vision_parse(
self,
file: Path,
) -> str | None:
"""
Uses Azure AI Vision to parse the document and return the text content.
It requests a searchable PDF output with embedded text.
The PDF is saved to the archive_path attribute.
Returns the text content extracted from the document.
If the parsing fails, it returns None.
"""
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.documentintelligence.models import AnalyzeOutputOption
from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient(
endpoint=self.settings.endpoint,
credential=AzureKeyCredential(self.settings.api_key),
)
with file.open("rb") as f:
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
poller = client.begin_analyze_document(
model_id="prebuilt-read",
body=analyze_request,
output_content_format=DocumentContentFormat.TEXT,
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
content_type="application/json",
)
poller.wait()
result_id = poller.details["operation_id"]
result = poller.result()
# Download the PDF with embedded text
self.archive_path = Path(self.tempdir) / "archive.pdf"
with self.archive_path.open("wb") as f:
for chunk in client.get_analyze_result_pdf(
model_id="prebuilt-read",
result_id=result_id,
):
f.write(chunk)
return result.content
def parse(self, document_path: Path, mime_type, file_name=None):
if not self.settings.engine_is_valid():
self.log.warning(
"No valid remote parser engine is configured, content will be empty.",
)
self.text = ""
return
elif self.settings.engine == "azureai":
self.text = self.azure_ai_vision_parse(document_path)

View File

@@ -1,18 +0,0 @@
def get_parser(*args, **kwargs):
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(*args, **kwargs)
def get_supported_mime_types():
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(None).supported_mime_types()
def remote_consumer_declaration(sender, **kwargs):
return {
"parser": get_parser,
"weight": 5,
"mime_types": get_supported_mime_types(),
}
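
A sketch of what the consumer effectively does once this declaration is connected, based on the parser tests below; the input path is hypothetical:

import uuid
from pathlib import Path

from paperless_remote.signals import get_parser

parser = get_parser(uuid.uuid4())  # instantiated with a logging-group UUID, as in the tests
parser.parse(Path("/tmp/example.pdf"), "application/pdf")  # hypothetical file
print(parser.text)          # extracted text, "" when no valid engine is configured
print(parser.archive_path)  # searchable PDF downloaded from Azure, if any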

View File

@@ -1,29 +0,0 @@
from django.test import TestCase
from django.test import override_settings
from paperless_remote import check_remote_parser_configured
class TestChecks(TestCase):
@override_settings(REMOTE_OCR_ENGINE=None)
def test_no_engine(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 0)
@override_settings(REMOTE_OCR_ENGINE="azureai")
@override_settings(REMOTE_OCR_API_KEY="somekey")
@override_settings(REMOTE_OCR_ENDPOINT=None)
def test_azure_no_endpoint(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 1)
self.assertTrue(
msgs[0].msg.startswith(
"Azure AI remote parser requires endpoint to be configured.",
),
)
@override_settings(REMOTE_OCR_ENGINE="something")
@override_settings(REMOTE_OCR_API_KEY="somekey")
def test_valid_configuration(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 0)

View File

@@ -1,101 +0,0 @@
import uuid
from pathlib import Path
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser
from paperless_remote.signals import get_parser
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
def assertContainsStrings(self, content, strings):
# Asserts that all strings appear in content, in the given order.
indices = []
for s in strings:
if s in content:
indices.append(content.index(s))
else:
self.fail(f"'{s}' is not in '{content}'")
self.assertListEqual(indices, sorted(indices))
@mock.patch("paperless_tesseract.parsers.run_subprocess")
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
# Arrange mock Azure client
mock_client = mock.Mock()
mock_client_cls.return_value = mock_client
# Simulate poller result and its `.details`
mock_poller = mock.Mock()
mock_poller.wait.return_value = None
mock_poller.details = {"operation_id": "fake-op-id"}
mock_client.begin_analyze_document.return_value = mock_poller
mock_poller.result.return_value.content = "This is a test document."
# Return dummy PDF bytes
mock_client.get_analyze_result_pdf.return_value = [
b"%PDF-",
b"1.7 ",
b"FAKEPDF",
]
# Simulate pdftotext by writing dummy text to sidecar file
def fake_run(cmd, *args, **kwargs):
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
f.write("This is a test document.")
mock_subprocess.side_effect = fake_run
with override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="somekey",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
):
parser = get_parser(uuid.uuid4())
parser.parse(
self.SAMPLE_FILES / "simple-digital.pdf",
"application/pdf",
)
self.assertContainsStrings(
parser.text.strip(),
["This is a test document."],
)
@override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="key",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
)
def test_supported_mime_types_valid_config(self):
parser = RemoteDocumentParser(uuid.uuid4())
expected_types = [
"application/pdf",
"image/png",
"image/jpeg",
"image/tiff",
"image/bmp",
"image/gif",
"image/webp",
]
self.assertEqual(parser.supported_mime_types(), expected_types)
def test_supported_mime_types_invalid_config(self):
parser = get_parser(uuid.uuid4())
self.assertEqual(parser.supported_mime_types(), [])
@override_settings(
REMOTE_OCR_ENGINE=None,
REMOTE_OCR_API_KEY=None,
REMOTE_OCR_ENDPOINT=None,
)
def test_parse_with_invalid_config(self):
parser = get_parser(uuid.uuid4())
parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
self.assertEqual(parser.text, "")

uv.lock (generated)
View File

@@ -1,5 +1,5 @@
version = 1
revision = 3
revision = 2
requires-python = ">=3.10"
resolution-markers = [
"sys_platform == 'darwin'",
@@ -93,34 +93,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" },
]
[[package]]
name = "azure-ai-documentintelligence"
version = "1.0.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 },
]
[[package]]
name = "azure-core"
version = "1.33.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 },
]
[[package]]
name = "babel"
version = "2.17.0"
@@ -340,15 +312,15 @@ wheels = [
[[package]]
name = "channels"
version = "4.3.1"
version = "4.3.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "asgiref", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "django", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/12/a0/46450fcf9e56af18a6b0440ba49db6635419bb7bc84142c35f4143b1a66c/channels-4.3.1.tar.gz", hash = "sha256:97413ffd674542db08e16a9ef09cd86ec0113e5f8125fbd33cf0854adcf27cdb", size = 26896, upload-time = "2025-08-01T13:25:19.952Z" }
sdist = { url = "https://files.pythonhosted.org/packages/72/04/6768c7a887f9c593c4d49f99130c8aec4ea06e750bc17c306b689f6caf3b/channels-4.3.0.tar.gz", hash = "sha256:7db32c61dcd88eada1647e6c6f6ad2eb724b75d4852eeff26ad1c51ccd1a37f7", size = 26816, upload-time = "2025-07-28T13:52:50.334Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/89/1c/eae1c2a8c195760376e7f65d0bdcc3e966695d29cfbe5c54841ce5c71408/channels-4.3.1-py3-none-any.whl", hash = "sha256:b091d4b26f91d807de3e84aead7ba785314f27eaf5bac31dd51b1c956b883859", size = 31286, upload-time = "2025-08-01T13:25:18.845Z" },
{ url = "https://files.pythonhosted.org/packages/7c/59/0866202ee593e1b0dab0b472ebb8169e1b2b7886ad3008d193da2bbe10cb/channels-4.3.0-py3-none-any.whl", hash = "sha256:0497f3affb95e621b37d6bae1b6a5d9e8e1e1221007a2566f280091cf30ffcce", size = 31238, upload-time = "2025-07-28T13:52:49.117Z" },
]
[[package]]
@@ -1411,15 +1383,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" },
]
[[package]]
name = "isodate"
version = "0.7.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 },
]
[[package]]
name = "jinja2"
version = "3.1.6"
@@ -1948,7 +1911,6 @@ name = "paperless-ngx"
version = "2.17.1"
source = { virtual = "." }
dependencies = [
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "channels", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -1984,7 +1946,6 @@ dependencies = [
{ name = "ocrmypdf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pathvalidate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pdf2image", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "psycopg-pool", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "python-dateutil", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "python-dotenv", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "python-gnupg", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2014,7 +1975,7 @@ postgres = [
{ name = "psycopg-c", version = "3.2.9", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version != '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux') or (python_full_version != '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux') or (platform_machine != 'aarch64' and platform_machine != 'x86_64' and sys_platform == 'linux') or sys_platform == 'darwin'" },
{ name = "psycopg-c", version = "3.2.9", source = { url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_aarch64.whl" }, marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux'" },
{ name = "psycopg-c", version = "3.2.9", source = { url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
{ name = "psycopg-pool", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "psycopg-pool", version = "3.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'darwin'" },
]
webserver = [
{ name = "granian", extra = ["uvloop"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2082,7 +2043,6 @@ typing = [
[package.metadata]
requires-dist = [
{ name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
{ name = "bleach", specifier = "~=6.2.0" },
{ name = "celery", extras = ["redis"], specifier = "~=5.5.1" },
{ name = "channels", specifier = "~=4.2" },
@@ -2124,8 +2084,7 @@ requires-dist = [
{ name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'aarch64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_aarch64.whl" },
{ name = "psycopg-c", marker = "python_full_version == '3.12.*' and platform_machine == 'x86_64' and sys_platform == 'linux' and extra == 'postgres'", url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_x86_64.whl" },
{ name = "psycopg-c", marker = "(python_full_version != '3.12.*' and platform_machine == 'aarch64' and extra == 'postgres') or (python_full_version != '3.12.*' and platform_machine == 'x86_64' and extra == 'postgres') or (platform_machine != 'aarch64' and platform_machine != 'x86_64' and extra == 'postgres') or (sys_platform != 'linux' and extra == 'postgres')", specifier = "==3.2.9" },
{ name = "psycopg-pool" },
{ name = "psycopg-pool", marker = "extra == 'postgres'", specifier = "==3.2.6" },
{ name = "psycopg-pool", marker = "extra == 'postgres'" },
{ name = "python-dateutil", specifier = "~=2.9.0" },
{ name = "python-dotenv", specifier = "~=1.1.0" },
{ name = "python-gnupg", specifier = "~=0.5.4" },
@@ -2136,7 +2095,7 @@ requires-dist = [
{ name = "redis", extras = ["hiredis"], specifier = "~=5.2.1" },
{ name = "scikit-learn", specifier = "~=1.7.0" },
{ name = "setproctitle", specifier = "~=1.3.4" },
{ name = "tika-client", specifier = "~=0.10.0" },
{ name = "tika-client", specifier = "~=0.9.0" },
{ name = "tqdm", specifier = "~=4.67.1" },
{ name = "watchdog", specifier = "~=6.0" },
{ name = "whitenoise", specifier = "~=6.9" },
@@ -2477,7 +2436,7 @@ c = [
{ name = "psycopg-c", version = "3.2.9", source = { url = "https://github.com/paperless-ngx/builder/releases/download/psycopg-3.2.9/psycopg_c-3.2.9-cp312-cp312-linux_x86_64.whl" }, marker = "python_full_version == '3.12.*' and implementation_name != 'pypy' and platform_machine == 'x86_64' and sys_platform == 'linux'" },
]
pool = [
{ name = "psycopg-pool", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "psycopg-pool", version = "3.2.6", source = { registry = "https://pypi.org/simple" }, marker = "sys_platform == 'linux' or sys_platform == 'darwin'" },
]
[[package]]
@@ -2516,14 +2475,12 @@ wheels = [
name = "psycopg-pool"
version = "3.2.6"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/cf/13/1e7850bb2c69a63267c3dbf37387d3f71a00fd0e2fa55c5db14d64ba1af4/psycopg_pool-3.2.6.tar.gz", hash = "sha256:0f92a7817719517212fbfe2fd58b8c35c1850cdd2a80d36b581ba2085d9148e5", size = 29770, upload-time = "2025-02-26T12:03:47.129Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/47/fd/4feb52a55c1a4bd748f2acaed1903ab54a723c47f6d0242780f4d97104d4/psycopg_pool-3.2.6-py3-none-any.whl", hash = "sha256:5887318a9f6af906d041a0b1dc1c60f8f0dda8340c2572b74e10907b51ed5da7", size = 38252, upload-time = "2025-02-26T12:03:45.073Z" },
]
[[package]]
name = "pyasn1"
version = "0.6.1"
@@ -2747,11 +2704,11 @@ wheels = [
[[package]]
name = "python-gnupg"
version = "0.5.5"
version = "0.5.4"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/42/d0/72a14a79f26c6119b281f6ccc475a787432ef155560278e60df97ce68a86/python-gnupg-0.5.5.tar.gz", hash = "sha256:3fdcaf76f60a1b948ff8e37dc398d03cf9ce7427065d583082b92da7a4ff5a63", size = 66467, upload-time = "2025-08-04T19:26:55.778Z" }
sdist = { url = "https://files.pythonhosted.org/packages/f1/3e/ba0dc69c9f4e0aeb24d93175230ef057c151790a7516012f61014918992d/python-gnupg-0.5.4.tar.gz", hash = "sha256:f2fdb5fb29615c77c2743e1cb3d9314353a6e87b10c37d238d91ae1c6feae086", size = 65705, upload-time = "2025-01-07T11:58:34.073Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/aa/19/c147f78cc18c8788f54d4a16a22f6c05deba85ead5672d3ddf6dcba5a5fe/python_gnupg-0.5.5-py2.py3-none-any.whl", hash = "sha256:51fa7b8831ff0914bc73d74c59b99c613de7247b91294323c39733bb85ac3fc1", size = 21916, upload-time = "2025-08-04T19:26:54.307Z" },
{ url = "https://files.pythonhosted.org/packages/7b/5b/6666ed5a0d3ce4d5444af62e373d5ba8ab253a03487c86f2f9f1078e7c31/python_gnupg-0.5.4-py2.py3-none-any.whl", hash = "sha256:40ce25cde9df29af91fe931ce9df3ce544e14a37f62b13ca878c897217b2de6c", size = 21730, upload-time = "2025-01-07T11:58:32.249Z" },
]
[[package]]
@@ -3397,16 +3354,16 @@ wheels = [
[[package]]
name = "tika-client"
version = "0.10.0"
version = "0.9.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "anyio", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "httpx", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "(python_full_version < '3.11' and sys_platform == 'darwin') or (python_full_version < '3.11' and sys_platform == 'linux')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/21/be/65bfc47e4689ecd5ead20cf47dc0084fd767b7e71e8cfabf5fddc42aae3c/tika_client-0.10.0.tar.gz", hash = "sha256:3101e8b2482ae4cb7f87be13ada970ff691bdc3404d94cd52f5e57a09c99370c", size = 2178257, upload-time = "2025-08-04T17:47:30.414Z" }
sdist = { url = "https://files.pythonhosted.org/packages/94/ad/3508e42b470a037b3f5c19ca9993893d0faa30ba7ec7e6ac33db9bc3bf51/tika_client-0.9.0.tar.gz", hash = "sha256:c10bba8e40ede23c039f84ccd821fb2d290d339cc26cbd267ab9b561a1e83659", size = 2175246, upload-time = "2025-01-15T18:46:23.901Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b1/31/002e0fa5bca67d6a19da8c294273486f6c46cbcc83d6879719a38a181461/tika_client-0.10.0-py3-none-any.whl", hash = "sha256:f5486cc884e4522575662aa295bda761bf9f101ac8d92840155b58ab8b96f6e2", size = 18237, upload-time = "2025-08-04T17:47:28.966Z" },
{ url = "https://files.pythonhosted.org/packages/36/8c/90ba51e014fb548ee34dd5ed14e85ec4a205ff97b89ca393e4de321304ac/tika_client-0.9.0-py3-none-any.whl", hash = "sha256:2464e8335b5e92c276641c729e7707f1e894a2bfb51cc59abdd3bdfb532da8a0", size = 17963, upload-time = "2025-01-15T18:46:21.143Z" },
]
[[package]]