mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-09-26 01:12:43 -05:00
Compare commits
4 Commits
feature-re
...
dependabot
Author | SHA1 | Date | |
---|---|---|---|
![]() |
8e19d009d9 | ||
![]() |
764ad059d1 | ||
![]() |
5e47069934 | ||
![]() |
4ff09c4cf4 |
91
.github/workflows/ci.yml
vendored
91
.github/workflows/ci.yml
vendored
@@ -192,18 +192,6 @@ jobs:
|
|||||||
token: ${{ secrets.CODECOV_TOKEN }}
|
token: ${{ secrets.CODECOV_TOKEN }}
|
||||||
flags: backend-python-${{ matrix.python-version }}
|
flags: backend-python-${{ matrix.python-version }}
|
||||||
files: coverage.xml
|
files: coverage.xml
|
||||||
- name: Upload coverage artifacts
|
|
||||||
uses: actions/upload-artifact@v4
|
|
||||||
if: always()
|
|
||||||
with:
|
|
||||||
name: backend-coverage-${{ matrix.python-version }}
|
|
||||||
path: |
|
|
||||||
.coverage
|
|
||||||
coverage.xml
|
|
||||||
junit.xml
|
|
||||||
retention-days: 1
|
|
||||||
include-hidden-files: true
|
|
||||||
if-no-files-found: error
|
|
||||||
- name: Stop containers
|
- name: Stop containers
|
||||||
if: always()
|
if: always()
|
||||||
run: |
|
run: |
|
||||||
@@ -286,17 +274,6 @@ jobs:
|
|||||||
token: ${{ secrets.CODECOV_TOKEN }}
|
token: ${{ secrets.CODECOV_TOKEN }}
|
||||||
flags: frontend-node-${{ matrix.node-version }}
|
flags: frontend-node-${{ matrix.node-version }}
|
||||||
directory: src-ui/coverage/
|
directory: src-ui/coverage/
|
||||||
- name: Upload coverage artifacts
|
|
||||||
uses: actions/upload-artifact@v4
|
|
||||||
if: always()
|
|
||||||
with:
|
|
||||||
name: frontend-coverage-${{ matrix.shard-index }}
|
|
||||||
path: |
|
|
||||||
src-ui/coverage/lcov.info
|
|
||||||
src-ui/coverage/coverage-final.json
|
|
||||||
src-ui/junit.xml
|
|
||||||
retention-days: 1
|
|
||||||
if-no-files-found: error
|
|
||||||
tests-frontend-e2e:
|
tests-frontend-e2e:
|
||||||
name: "Frontend E2E Tests (Node ${{ matrix.node-version }} - ${{ matrix.shard-index }}/${{ matrix.shard-count }})"
|
name: "Frontend E2E Tests (Node ${{ matrix.node-version }} - ${{ matrix.shard-index }}/${{ matrix.shard-count }})"
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ubuntu-24.04
|
||||||
@@ -377,74 +354,6 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||||
run: cd src-ui && pnpm run build --configuration=production
|
run: cd src-ui && pnpm run build --configuration=production
|
||||||
sonarqube-analysis:
|
|
||||||
name: "SonarQube Analysis"
|
|
||||||
runs-on: ubuntu-24.04
|
|
||||||
needs:
|
|
||||||
- tests-backend
|
|
||||||
- tests-frontend
|
|
||||||
if: github.repository_owner == 'paperless-ngx'
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v5
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
- name: Download all backend coverage
|
|
||||||
uses: actions/download-artifact@v5.0.0
|
|
||||||
with:
|
|
||||||
pattern: backend-coverage-*
|
|
||||||
path: ./coverage/
|
|
||||||
- name: Download all frontend coverage
|
|
||||||
uses: actions/download-artifact@v5.0.0
|
|
||||||
with:
|
|
||||||
pattern: frontend-coverage-*
|
|
||||||
path: ./coverage/
|
|
||||||
- name: Set up Python
|
|
||||||
uses: actions/setup-python@v5
|
|
||||||
with:
|
|
||||||
python-version: ${{ env.DEFAULT_PYTHON_VERSION }}
|
|
||||||
- name: Install coverage tools
|
|
||||||
run: |
|
|
||||||
pip install coverage
|
|
||||||
npm install -g nyc
|
|
||||||
# Merge backend coverage from all Python versions
|
|
||||||
- name: Merge backend coverage
|
|
||||||
run: |
|
|
||||||
coverage combine coverage/backend-coverage-*/.coverage
|
|
||||||
coverage xml -o merged-backend-coverage.xml
|
|
||||||
# Merge frontend coverage from all shards
|
|
||||||
- name: Merge frontend coverage
|
|
||||||
run: |
|
|
||||||
# Find all coverage-final.json files from the shards, exit with error if none found
|
|
||||||
shopt -s nullglob
|
|
||||||
files=(coverage/frontend-coverage-*/coverage/coverage-final.json)
|
|
||||||
if [ ${#files[@]} -eq 0 ]; then
|
|
||||||
echo "No frontend coverage JSON found under coverage/" >&2
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
# Create .nyc_output directory and copy each shard's coverage JSON into it with a unique name
|
|
||||||
mkdir -p .nyc_output
|
|
||||||
for coverage_json in "${files[@]}"; do
|
|
||||||
shard=$(basename "$(dirname "$(dirname "$coverage_json")")")
|
|
||||||
cp "$coverage_json" ".nyc_output/${shard}.json"
|
|
||||||
done
|
|
||||||
npx nyc merge .nyc_output .nyc_output/out.json
|
|
||||||
npx nyc report --reporter=lcovonly --report-dir coverage
|
|
||||||
- name: Upload coverage artifacts
|
|
||||||
uses: actions/upload-artifact@v4.6.2
|
|
||||||
with:
|
|
||||||
name: merged-coverage
|
|
||||||
path: |
|
|
||||||
merged-backend-coverage.xml
|
|
||||||
.nyc_output/*
|
|
||||||
coverage/lcov.info
|
|
||||||
retention-days: 7
|
|
||||||
if-no-files-found: error
|
|
||||||
include-hidden-files: true
|
|
||||||
- name: SonarQube Analysis
|
|
||||||
uses: SonarSource/sonarqube-scan-action@v5
|
|
||||||
env:
|
|
||||||
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
|
|
||||||
build-docker-image:
|
build-docker-image:
|
||||||
name: Build Docker image for ${{ github.ref_name }}
|
name: Build Docker image for ${{ github.ref_name }}
|
||||||
runs-on: ubuntu-24.04
|
runs-on: ubuntu-24.04
|
||||||
|
@@ -1805,23 +1805,3 @@ password. All of these options come from their similarly-named [Django settings]
|
|||||||
#### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL}
|
#### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL}
|
||||||
|
|
||||||
: Defaults to false.
|
: Defaults to false.
|
||||||
|
|
||||||
## Remote OCR
|
|
||||||
|
|
||||||
#### [`PAPERLESS_REMOTE_OCR_ENGINE=<str>`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE}
|
|
||||||
|
|
||||||
: The remote OCR engine to use. Currently only Azure AI is supported as "azureai".
|
|
||||||
|
|
||||||
Defaults to None, which disables remote OCR.
|
|
||||||
|
|
||||||
#### [`PAPERLESS_REMOTE_OCR_API_KEY=<str>`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY}
|
|
||||||
|
|
||||||
: The API key to use for the remote OCR engine.
|
|
||||||
|
|
||||||
Defaults to None.
|
|
||||||
|
|
||||||
#### [`PAPERLESS_REMOTE_OCR_ENDPOINT=<str>`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT}
|
|
||||||
|
|
||||||
: The endpoint to use for the remote OCR engine. This is required for Azure AI.
|
|
||||||
|
|
||||||
Defaults to None.
|
|
||||||
|
@@ -25,10 +25,9 @@ physical documents into a searchable online archive so you can keep, well, _less
|
|||||||
## Features
|
## Features
|
||||||
|
|
||||||
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
|
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
|
||||||
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
|
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way.
|
||||||
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
|
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
|
||||||
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
|
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
|
||||||
- _New!_ Supports remote OCR with Azure AI (opt-in).
|
|
||||||
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
|
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
|
||||||
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
|
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
|
||||||
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, PowerPoint, and LibreOffice equivalents)[^1] and more.
|
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, PowerPoint, and LibreOffice equivalents)[^1] and more.
|
||||||
|
@@ -882,21 +882,6 @@ how regularly you intend to scan documents and use paperless.
|
|||||||
performed the task associated with the document, move it to the
|
performed the task associated with the document, move it to the
|
||||||
inbox.
|
inbox.
|
||||||
|
|
||||||
## Remote OCR
|
|
||||||
|
|
||||||
!!! important
|
|
||||||
|
|
||||||
This feature is disabled by default and will always remain strictly "opt-in".
|
|
||||||
|
|
||||||
Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to
|
|
||||||
[Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence).
|
|
||||||
This is of course a paid service (with a free tier) which requires an Azure account and subscription. Azure AI is not affiliated with
|
|
||||||
Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing
|
|
||||||
the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details.
|
|
||||||
|
|
||||||
Additionally, when using a commercial service with this feature, consider both potential costs as well as any associated file size
|
|
||||||
or page limitations (e.g. with a free tier).
|
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
Paperless-ngx consists of the following components:
|
Paperless-ngx consists of the following components:
|
||||||
|
@@ -15,7 +15,6 @@ classifiers = [
|
|||||||
# This will allow testing to not install a webserver, mysql, etc
|
# This will allow testing to not install a webserver, mysql, etc
|
||||||
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"azure-ai-documentintelligence>=1.0.2",
|
|
||||||
"babel>=2.17",
|
"babel>=2.17",
|
||||||
"bleach~=6.2.0",
|
"bleach~=6.2.0",
|
||||||
"celery[redis]~=5.5.1",
|
"celery[redis]~=5.5.1",
|
||||||
@@ -34,7 +33,7 @@ dependencies = [
|
|||||||
"django-cors-headers~=4.8.0",
|
"django-cors-headers~=4.8.0",
|
||||||
"django-extensions~=4.1",
|
"django-extensions~=4.1",
|
||||||
"django-filter~=25.1",
|
"django-filter~=25.1",
|
||||||
"django-guardian~=3.1.2",
|
"django-guardian~=3.2.0",
|
||||||
"django-multiselectfield~=1.0.1",
|
"django-multiselectfield~=1.0.1",
|
||||||
"django-soft-delete~=1.0.18",
|
"django-soft-delete~=1.0.18",
|
||||||
"django-treenode>=0.23.2",
|
"django-treenode>=0.23.2",
|
||||||
@@ -234,7 +233,6 @@ testpaths = [
|
|||||||
"src/paperless_tesseract/tests/",
|
"src/paperless_tesseract/tests/",
|
||||||
"src/paperless_tika/tests",
|
"src/paperless_tika/tests",
|
||||||
"src/paperless_text/tests/",
|
"src/paperless_text/tests/",
|
||||||
"src/paperless_remote/tests/",
|
|
||||||
]
|
]
|
||||||
addopts = [
|
addopts = [
|
||||||
"--pythonwarnings=all",
|
"--pythonwarnings=all",
|
||||||
@@ -257,7 +255,6 @@ PAPERLESS_DISABLE_DBHANDLER = "true"
|
|||||||
PAPERLESS_CACHE_BACKEND = "django.core.cache.backends.locmem.LocMemCache"
|
PAPERLESS_CACHE_BACKEND = "django.core.cache.backends.locmem.LocMemCache"
|
||||||
|
|
||||||
[tool.coverage.run]
|
[tool.coverage.run]
|
||||||
relative_files = true
|
|
||||||
source = [
|
source = [
|
||||||
"src/",
|
"src/",
|
||||||
]
|
]
|
||||||
|
@@ -1,24 +0,0 @@
|
|||||||
sonar.projectKey=paperless-ngx_paperless-ngx
|
|
||||||
sonar.organization=paperless-ngx
|
|
||||||
sonar.projectName=Paperless-ngx
|
|
||||||
sonar.projectVersion=1.0
|
|
||||||
|
|
||||||
# Source and test directories
|
|
||||||
sonar.sources=src/,src-ui/
|
|
||||||
sonar.test.inclusions=**/test_*.py,**/tests.py,**/*.spec.ts,**/*.test.ts
|
|
||||||
|
|
||||||
# Language specific settings
|
|
||||||
sonar.python.version=3.10,3.11,3.12,3.13
|
|
||||||
|
|
||||||
# Coverage reports
|
|
||||||
sonar.python.coverage.reportPaths=merged-backend-coverage.xml
|
|
||||||
sonar.javascript.lcov.reportPaths=coverage/lcov.info
|
|
||||||
|
|
||||||
# Test execution reports
|
|
||||||
sonar.junit.reportPaths=**/junit.xml,**/test-results.xml
|
|
||||||
|
|
||||||
# Encoding
|
|
||||||
sonar.sourceEncoding=UTF-8
|
|
||||||
|
|
||||||
# Exclusions
|
|
||||||
sonar.exclusions=**/migrations/**,**/node_modules/**,**/static/**,**/venv/**,**/.venv/**,**/dist/**
|
|
@@ -177,10 +177,16 @@ export class CustomFieldEditDialogComponent
|
|||||||
}
|
}
|
||||||
|
|
||||||
public removeSelectOption(index: number) {
|
public removeSelectOption(index: number) {
|
||||||
this.selectOptions.removeAt(index)
|
const globalIndex =
|
||||||
this._allSelectOptions.splice(
|
index + (this.selectOptionsPage - 1) * SELECT_OPTION_PAGE_SIZE
|
||||||
index + (this.selectOptionsPage - 1) * SELECT_OPTION_PAGE_SIZE,
|
this._allSelectOptions.splice(globalIndex, 1)
|
||||||
1
|
|
||||||
|
const totalPages = Math.max(
|
||||||
|
1,
|
||||||
|
Math.ceil(this._allSelectOptions.length / SELECT_OPTION_PAGE_SIZE)
|
||||||
)
|
)
|
||||||
|
const targetPage = Math.min(this.selectOptionsPage, totalPages)
|
||||||
|
|
||||||
|
this.selectOptionsPage = targetPage
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -164,6 +164,9 @@ class BarcodePlugin(ConsumeTaskPlugin):
|
|||||||
mailrule_id=self.input_doc.mailrule_id,
|
mailrule_id=self.input_doc.mailrule_id,
|
||||||
# Can't use same folder or the consume might grab it again
|
# Can't use same folder or the consume might grab it again
|
||||||
original_file=(tmp_dir / new_document.name).resolve(),
|
original_file=(tmp_dir / new_document.name).resolve(),
|
||||||
|
# Adding optional original_path for later uses in
|
||||||
|
# workflow matching
|
||||||
|
original_path=self.input_doc.original_file,
|
||||||
),
|
),
|
||||||
# All the same metadata
|
# All the same metadata
|
||||||
self.metadata,
|
self.metadata,
|
||||||
|
@@ -156,6 +156,7 @@ class ConsumableDocument:
|
|||||||
|
|
||||||
source: DocumentSource
|
source: DocumentSource
|
||||||
original_file: Path
|
original_file: Path
|
||||||
|
original_path: Path | None = None
|
||||||
mailrule_id: int | None = None
|
mailrule_id: int | None = None
|
||||||
mime_type: str = dataclasses.field(init=False, default=None)
|
mime_type: str = dataclasses.field(init=False, default=None)
|
||||||
|
|
||||||
|
@@ -314,11 +314,19 @@ def consumable_document_matches_workflow(
|
|||||||
trigger_matched = False
|
trigger_matched = False
|
||||||
|
|
||||||
# Document path vs trigger path
|
# Document path vs trigger path
|
||||||
|
|
||||||
|
# Use the original_path if set, else use the original_file
|
||||||
|
match_against = (
|
||||||
|
document.original_path
|
||||||
|
if document.original_path is not None
|
||||||
|
else document.original_file
|
||||||
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
trigger.filter_path is not None
|
trigger.filter_path is not None
|
||||||
and len(trigger.filter_path) > 0
|
and len(trigger.filter_path) > 0
|
||||||
and not fnmatch(
|
and not fnmatch(
|
||||||
document.original_file,
|
match_against,
|
||||||
trigger.filter_path,
|
trigger.filter_path,
|
||||||
)
|
)
|
||||||
):
|
):
|
||||||
|
@@ -614,14 +614,16 @@ class TestBarcodeNewConsume(
|
|||||||
self.assertIsNotFile(temp_copy)
|
self.assertIsNotFile(temp_copy)
|
||||||
|
|
||||||
# Check the split files exist
|
# Check the split files exist
|
||||||
|
# Check the original_path is set
|
||||||
# Check the source is unchanged
|
# Check the source is unchanged
|
||||||
# Check the overrides are unchanged
|
# Check the overrides are unchanged
|
||||||
for (
|
for (
|
||||||
new_input_doc,
|
new_input_doc,
|
||||||
new_doc_overrides,
|
new_doc_overrides,
|
||||||
) in self.get_all_consume_delay_call_args():
|
) in self.get_all_consume_delay_call_args():
|
||||||
self.assertEqual(new_input_doc.source, DocumentSource.ConsumeFolder)
|
|
||||||
self.assertIsFile(new_input_doc.original_file)
|
self.assertIsFile(new_input_doc.original_file)
|
||||||
|
self.assertEqual(new_input_doc.original_path, temp_copy)
|
||||||
|
self.assertEqual(new_input_doc.source, DocumentSource.ConsumeFolder)
|
||||||
self.assertEqual(overrides, new_doc_overrides)
|
self.assertEqual(overrides, new_doc_overrides)
|
||||||
|
|
||||||
|
|
||||||
|
@@ -322,7 +322,6 @@ INSTALLED_APPS = [
|
|||||||
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
"paperless_tesseract.apps.PaperlessTesseractConfig",
|
||||||
"paperless_text.apps.PaperlessTextConfig",
|
"paperless_text.apps.PaperlessTextConfig",
|
||||||
"paperless_mail.apps.PaperlessMailConfig",
|
"paperless_mail.apps.PaperlessMailConfig",
|
||||||
"paperless_remote.apps.PaperlessRemoteParserConfig",
|
|
||||||
"django.contrib.admin",
|
"django.contrib.admin",
|
||||||
"rest_framework",
|
"rest_framework",
|
||||||
"rest_framework.authtoken",
|
"rest_framework.authtoken",
|
||||||
@@ -1390,10 +1389,3 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean(
|
|||||||
"PAPERLESS_WEBHOOKS_ALLOW_INTERNAL_REQUESTS",
|
"PAPERLESS_WEBHOOKS_ALLOW_INTERNAL_REQUESTS",
|
||||||
"true",
|
"true",
|
||||||
)
|
)
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
# Remote Parser #
|
|
||||||
###############################################################################
|
|
||||||
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
|
|
||||||
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
|
|
||||||
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")
|
|
||||||
|
@@ -1,4 +0,0 @@
|
|||||||
# this is here so that django finds the checks.
|
|
||||||
from paperless_remote.checks import check_remote_parser_configured
|
|
||||||
|
|
||||||
__all__ = ["check_remote_parser_configured"]
|
|
@@ -1,14 +0,0 @@
|
|||||||
from django.apps import AppConfig
|
|
||||||
|
|
||||||
from paperless_remote.signals import remote_consumer_declaration
|
|
||||||
|
|
||||||
|
|
||||||
class PaperlessRemoteParserConfig(AppConfig):
|
|
||||||
name = "paperless_remote"
|
|
||||||
|
|
||||||
def ready(self):
|
|
||||||
from documents.signals import document_consumer_declaration
|
|
||||||
|
|
||||||
document_consumer_declaration.connect(remote_consumer_declaration)
|
|
||||||
|
|
||||||
AppConfig.ready(self)
|
|
@@ -1,17 +0,0 @@
|
|||||||
from django.conf import settings
|
|
||||||
from django.core.checks import Error
|
|
||||||
from django.core.checks import register
|
|
||||||
|
|
||||||
|
|
||||||
@register()
|
|
||||||
def check_remote_parser_configured(app_configs, **kwargs):
|
|
||||||
if settings.REMOTE_OCR_ENGINE == "azureai" and not (
|
|
||||||
settings.REMOTE_OCR_ENDPOINT and settings.REMOTE_OCR_API_KEY
|
|
||||||
):
|
|
||||||
return [
|
|
||||||
Error(
|
|
||||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
return []
|
|
@@ -1,113 +0,0 @@
|
|||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from django.conf import settings
|
|
||||||
|
|
||||||
from paperless_tesseract.parsers import RasterisedDocumentParser
|
|
||||||
|
|
||||||
|
|
||||||
class RemoteEngineConfig:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
engine: str,
|
|
||||||
api_key: str | None = None,
|
|
||||||
endpoint: str | None = None,
|
|
||||||
):
|
|
||||||
self.engine = engine
|
|
||||||
self.api_key = api_key
|
|
||||||
self.endpoint = endpoint
|
|
||||||
|
|
||||||
def engine_is_valid(self):
|
|
||||||
valid = self.engine in ["azureai"] and self.api_key is not None
|
|
||||||
if self.engine == "azureai":
|
|
||||||
valid = valid and self.endpoint is not None
|
|
||||||
return valid
|
|
||||||
|
|
||||||
|
|
||||||
class RemoteDocumentParser(RasterisedDocumentParser):
|
|
||||||
"""
|
|
||||||
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure AI Vision
|
|
||||||
as this is the only service that provides a remote OCR API with text-embedded PDF output.
|
|
||||||
"""
|
|
||||||
|
|
||||||
logging_name = "paperless.parsing.remote"
|
|
||||||
|
|
||||||
def get_settings(self) -> RemoteEngineConfig:
|
|
||||||
"""
|
|
||||||
Returns the configuration for the remote OCR engine, loaded from Django settings.
|
|
||||||
"""
|
|
||||||
return RemoteEngineConfig(
|
|
||||||
engine=settings.REMOTE_OCR_ENGINE,
|
|
||||||
api_key=settings.REMOTE_OCR_API_KEY,
|
|
||||||
endpoint=settings.REMOTE_OCR_ENDPOINT,
|
|
||||||
)
|
|
||||||
|
|
||||||
def supported_mime_types(self):
|
|
||||||
if self.settings.engine_is_valid():
|
|
||||||
return {
|
|
||||||
"application/pdf": ".pdf",
|
|
||||||
"image/png": ".png",
|
|
||||||
"image/jpeg": ".jpg",
|
|
||||||
"image/tiff": ".tiff",
|
|
||||||
"image/bmp": ".bmp",
|
|
||||||
"image/gif": ".gif",
|
|
||||||
"image/webp": ".webp",
|
|
||||||
}
|
|
||||||
else:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
def azure_ai_vision_parse(
|
|
||||||
self,
|
|
||||||
file: Path,
|
|
||||||
) -> str | None:
|
|
||||||
"""
|
|
||||||
Uses Azure AI Vision to parse the document and return the text content.
|
|
||||||
It requests a searchable PDF output with embedded text.
|
|
||||||
The PDF is saved to the archive_path attribute.
|
|
||||||
Returns the text content extracted from the document.
|
|
||||||
If the parsing fails, it returns None.
|
|
||||||
"""
|
|
||||||
from azure.ai.documentintelligence import DocumentIntelligenceClient
|
|
||||||
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
|
|
||||||
from azure.ai.documentintelligence.models import AnalyzeOutputOption
|
|
||||||
from azure.ai.documentintelligence.models import DocumentContentFormat
|
|
||||||
from azure.core.credentials import AzureKeyCredential
|
|
||||||
|
|
||||||
client = DocumentIntelligenceClient(
|
|
||||||
endpoint=self.settings.endpoint,
|
|
||||||
credential=AzureKeyCredential(self.settings.api_key),
|
|
||||||
)
|
|
||||||
|
|
||||||
with file.open("rb") as f:
|
|
||||||
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
|
|
||||||
poller = client.begin_analyze_document(
|
|
||||||
model_id="prebuilt-read",
|
|
||||||
body=analyze_request,
|
|
||||||
output_content_format=DocumentContentFormat.TEXT,
|
|
||||||
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
|
|
||||||
content_type="application/json",
|
|
||||||
)
|
|
||||||
|
|
||||||
poller.wait()
|
|
||||||
result_id = poller.details["operation_id"]
|
|
||||||
result = poller.result()
|
|
||||||
|
|
||||||
# Download the PDF with embedded text
|
|
||||||
self.archive_path = self.tempdir / "archive.pdf"
|
|
||||||
with self.archive_path.open("wb") as f:
|
|
||||||
for chunk in client.get_analyze_result_pdf(
|
|
||||||
model_id="prebuilt-read",
|
|
||||||
result_id=result_id,
|
|
||||||
):
|
|
||||||
f.write(chunk)
|
|
||||||
|
|
||||||
client.close()
|
|
||||||
return result.content
|
|
||||||
|
|
||||||
def parse(self, document_path: Path, mime_type, file_name=None):
|
|
||||||
if not self.settings.engine_is_valid():
|
|
||||||
self.log.warning(
|
|
||||||
"No valid remote parser engine is configured, content will be empty.",
|
|
||||||
)
|
|
||||||
self.text = ""
|
|
||||||
elif self.settings.engine == "azureai":
|
|
||||||
self.text = self.azure_ai_vision_parse(document_path)
|
|
@@ -1,18 +0,0 @@
|
|||||||
def get_parser(*args, **kwargs):
|
|
||||||
from paperless_remote.parsers import RemoteDocumentParser
|
|
||||||
|
|
||||||
return RemoteDocumentParser(*args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def get_supported_mime_types():
|
|
||||||
from paperless_remote.parsers import RemoteDocumentParser
|
|
||||||
|
|
||||||
return RemoteDocumentParser(None).supported_mime_types()
|
|
||||||
|
|
||||||
|
|
||||||
def remote_consumer_declaration(sender, **kwargs):
|
|
||||||
return {
|
|
||||||
"parser": get_parser,
|
|
||||||
"weight": 5,
|
|
||||||
"mime_types": get_supported_mime_types(),
|
|
||||||
}
|
|
Binary file not shown.
@@ -1,24 +0,0 @@
|
|||||||
from unittest import TestCase
|
|
||||||
|
|
||||||
from django.test import override_settings
|
|
||||||
|
|
||||||
from paperless_remote import check_remote_parser_configured
|
|
||||||
|
|
||||||
|
|
||||||
class TestChecks(TestCase):
|
|
||||||
@override_settings(REMOTE_OCR_ENGINE=None)
|
|
||||||
def test_no_engine(self):
|
|
||||||
msgs = check_remote_parser_configured(None)
|
|
||||||
self.assertEqual(len(msgs), 0)
|
|
||||||
|
|
||||||
@override_settings(REMOTE_OCR_ENGINE="azureai")
|
|
||||||
@override_settings(REMOTE_OCR_API_KEY="somekey")
|
|
||||||
@override_settings(REMOTE_OCR_ENDPOINT=None)
|
|
||||||
def test_azure_no_endpoint(self):
|
|
||||||
msgs = check_remote_parser_configured(None)
|
|
||||||
self.assertEqual(len(msgs), 1)
|
|
||||||
self.assertTrue(
|
|
||||||
msgs[0].msg.startswith(
|
|
||||||
"Azure AI remote parser requires endpoint and API key to be configured.",
|
|
||||||
),
|
|
||||||
)
|
|
@@ -1,101 +0,0 @@
|
|||||||
import uuid
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest import mock
|
|
||||||
|
|
||||||
from django.test import TestCase
|
|
||||||
from django.test import override_settings
|
|
||||||
|
|
||||||
from documents.tests.utils import DirectoriesMixin
|
|
||||||
from documents.tests.utils import FileSystemAssertsMixin
|
|
||||||
from paperless_remote.parsers import RemoteDocumentParser
|
|
||||||
from paperless_remote.signals import get_parser
|
|
||||||
|
|
||||||
|
|
||||||
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
|
|
||||||
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
|
|
||||||
|
|
||||||
def assertContainsStrings(self, content: str, strings: list[str]):
|
|
||||||
# Asserts that all strings appear in content, in the given order.
|
|
||||||
indices = []
|
|
||||||
for s in strings:
|
|
||||||
if s in content:
|
|
||||||
indices.append(content.index(s))
|
|
||||||
else:
|
|
||||||
self.fail(f"'{s}' is not in '{content}'")
|
|
||||||
self.assertListEqual(indices, sorted(indices))
|
|
||||||
|
|
||||||
@mock.patch("paperless_tesseract.parsers.run_subprocess")
|
|
||||||
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
|
|
||||||
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
|
|
||||||
# Arrange mock Azure client
|
|
||||||
mock_client = mock.Mock()
|
|
||||||
mock_client_cls.return_value = mock_client
|
|
||||||
|
|
||||||
# Simulate poller result and its `.details`
|
|
||||||
mock_poller = mock.Mock()
|
|
||||||
mock_poller.wait.return_value = None
|
|
||||||
mock_poller.details = {"operation_id": "fake-op-id"}
|
|
||||||
mock_client.begin_analyze_document.return_value = mock_poller
|
|
||||||
mock_poller.result.return_value.content = "This is a test document."
|
|
||||||
|
|
||||||
# Return dummy PDF bytes
|
|
||||||
mock_client.get_analyze_result_pdf.return_value = [
|
|
||||||
b"%PDF-",
|
|
||||||
b"1.7 ",
|
|
||||||
b"FAKEPDF",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Simulate pdftotext by writing dummy text to sidecar file
|
|
||||||
def fake_run(cmd, *args, **kwargs):
|
|
||||||
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
|
|
||||||
f.write("This is a test document.")
|
|
||||||
|
|
||||||
mock_subprocess.side_effect = fake_run
|
|
||||||
|
|
||||||
with override_settings(
|
|
||||||
REMOTE_OCR_ENGINE="azureai",
|
|
||||||
REMOTE_OCR_API_KEY="somekey",
|
|
||||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
|
||||||
):
|
|
||||||
parser = get_parser(uuid.uuid4())
|
|
||||||
parser.parse(
|
|
||||||
self.SAMPLE_FILES / "simple-digital.pdf",
|
|
||||||
"application/pdf",
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertContainsStrings(
|
|
||||||
parser.text.strip(),
|
|
||||||
["This is a test document."],
|
|
||||||
)
|
|
||||||
|
|
||||||
@override_settings(
|
|
||||||
REMOTE_OCR_ENGINE="azureai",
|
|
||||||
REMOTE_OCR_API_KEY="key",
|
|
||||||
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
|
|
||||||
)
|
|
||||||
def test_supported_mime_types_valid_config(self):
|
|
||||||
parser = RemoteDocumentParser(uuid.uuid4())
|
|
||||||
expected_types = {
|
|
||||||
"application/pdf": ".pdf",
|
|
||||||
"image/png": ".png",
|
|
||||||
"image/jpeg": ".jpg",
|
|
||||||
"image/tiff": ".tiff",
|
|
||||||
"image/bmp": ".bmp",
|
|
||||||
"image/gif": ".gif",
|
|
||||||
"image/webp": ".webp",
|
|
||||||
}
|
|
||||||
self.assertEqual(parser.supported_mime_types(), expected_types)
|
|
||||||
|
|
||||||
def test_supported_mime_types_invalid_config(self):
|
|
||||||
parser = get_parser(uuid.uuid4())
|
|
||||||
self.assertEqual(parser.supported_mime_types(), {})
|
|
||||||
|
|
||||||
@override_settings(
|
|
||||||
REMOTE_OCR_ENGINE=None,
|
|
||||||
REMOTE_OCR_API_KEY=None,
|
|
||||||
REMOTE_OCR_ENDPOINT=None,
|
|
||||||
)
|
|
||||||
def test_parse_with_invalid_config(self):
|
|
||||||
parser = get_parser(uuid.uuid4())
|
|
||||||
parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
|
|
||||||
self.assertEqual(parser.text, "")
|
|
47
uv.lock
generated
47
uv.lock
generated
@@ -95,34 +95,6 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" },
|
{ url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "azure-ai-documentintelligence"
|
|
||||||
version = "1.0.2"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
dependencies = [
|
|
||||||
{ name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
|
||||||
{ name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
|
||||||
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
|
||||||
]
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "azure-core"
|
|
||||||
version = "1.33.0"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
dependencies = [
|
|
||||||
{ name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
|
||||||
{ name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
|
||||||
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
|
||||||
]
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "babel"
|
name = "babel"
|
||||||
version = "2.17.0"
|
version = "2.17.0"
|
||||||
@@ -810,15 +782,15 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "django-guardian"
|
name = "django-guardian"
|
||||||
version = "3.1.3"
|
version = "3.2.0"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "django", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "django", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "typing-extensions", marker = "(python_full_version < '3.13' and sys_platform == 'darwin') or (python_full_version < '3.13' and sys_platform == 'linux')" },
|
{ name = "typing-extensions", marker = "(python_full_version < '3.13' and sys_platform == 'darwin') or (python_full_version < '3.13' and sys_platform == 'linux')" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/81/d3/436a44c7688fce1a978224c349ba66c95bf9103d548596b7a2694fd58c03/django_guardian-3.1.3.tar.gz", hash = "sha256:12b5e66c18c97088b0adfa033ab14be68c321c170fd3ec438898271f00a71699", size = 93571, upload-time = "2025-09-10T08:36:23.928Z" }
|
sdist = { url = "https://files.pythonhosted.org/packages/e2/f9/bcff6a931298b9eb55e1550b55ab964fab747f594ba6d2d81cbe19736c5f/django_guardian-3.2.0.tar.gz", hash = "sha256:9e18ecd2e211b665972690c2d03d27bce0ea4932b5efac24a4bb9d526950a69e", size = 99940, upload-time = "2025-09-16T10:35:53.609Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/83/fc/6fd7b8bc7c52cbbfd1714673cfd28ff0b3fae32265c52d492ec0dee22cb8/django_guardian-3.1.3-py3-none-any.whl", hash = "sha256:90e28b40eea65c326a3a961908cc300f9e1cd69b74e88d38317a9befa167b71c", size = 127687, upload-time = "2025-09-10T08:36:22.533Z" },
|
{ url = "https://files.pythonhosted.org/packages/2f/23/63a7d868373a73d25c4a5c2dd3cce3aaeb22fbee82560d42b6e93ba01403/django_guardian-3.2.0-py3-none-any.whl", hash = "sha256:0768565a057988a93fc4a1d93649c4a794abfd7473a8408a079cfbf83c559d77", size = 134674, upload-time = "2025-09-16T10:35:51.69Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@@ -1440,15 +1412,6 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" },
|
{ url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "isodate"
|
|
||||||
version = "0.7.2"
|
|
||||||
source = { registry = "https://pypi.org/simple" }
|
|
||||||
sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "jinja2"
|
name = "jinja2"
|
||||||
version = "3.1.6"
|
version = "3.1.6"
|
||||||
@@ -2069,7 +2032,6 @@ name = "paperless-ngx"
|
|||||||
version = "2.18.4"
|
version = "2.18.4"
|
||||||
source = { virtual = "." }
|
source = { virtual = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
|
||||||
{ name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
|
||||||
@@ -2207,7 +2169,6 @@ typing = [
|
|||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
|
|
||||||
{ name = "babel", specifier = ">=2.17" },
|
{ name = "babel", specifier = ">=2.17" },
|
||||||
{ name = "bleach", specifier = "~=6.2.0" },
|
{ name = "bleach", specifier = "~=6.2.0" },
|
||||||
{ name = "celery", extras = ["redis"], specifier = "~=5.5.1" },
|
{ name = "celery", extras = ["redis"], specifier = "~=5.5.1" },
|
||||||
@@ -2224,7 +2185,7 @@ requires-dist = [
|
|||||||
{ name = "django-cors-headers", specifier = "~=4.8.0" },
|
{ name = "django-cors-headers", specifier = "~=4.8.0" },
|
||||||
{ name = "django-extensions", specifier = "~=4.1" },
|
{ name = "django-extensions", specifier = "~=4.1" },
|
||||||
{ name = "django-filter", specifier = "~=25.1" },
|
{ name = "django-filter", specifier = "~=25.1" },
|
||||||
{ name = "django-guardian", specifier = "~=3.1.2" },
|
{ name = "django-guardian", specifier = "~=3.2.0" },
|
||||||
{ name = "django-multiselectfield", specifier = "~=1.0.1" },
|
{ name = "django-multiselectfield", specifier = "~=1.0.1" },
|
||||||
{ name = "django-soft-delete", specifier = "~=1.0.18" },
|
{ name = "django-soft-delete", specifier = "~=1.0.18" },
|
||||||
{ name = "django-treenode", specifier = ">=0.23.2" },
|
{ name = "django-treenode", specifier = ">=0.23.2" },
|
||||||
|
Reference in New Issue
Block a user