Compare commits


25 Commits

Author SHA1 Message Date
shamoon  c54073b7c2  Merge branch 'dev' into feature-remote-ocr-2  2025-09-04 09:16:59 -07:00
shamoon  247e6f39dc  Merge branch 'dev' into feature-remote-ocr-2  2025-09-01 20:10:40 -07:00
shamoon  1e6dfc4481  Merge branch 'dev' into feature-remote-ocr-2  2025-08-26 13:30:39 -07:00
shamoon  7cc0750066  Add note on costs and limitations for Azure OCR  2025-08-24 05:47:07 -07:00
shamoon  bd6585d3b4  Merge branch 'dev' into feature-remote-ocr-2  2025-08-22 08:54:26 -07:00
shamoon  717e828a1d  Merge branch 'dev' into feature-remote-ocr-2  2025-08-17 21:25:14 -07:00
shamoon  07381d48e6  Merge branch 'dev' into feature-remote-ocr-2  2025-08-17 07:49:58 -07:00
shamoon  dd0ffaf312  Merge branch 'dev' into feature-remote-ocr-2  2025-08-11 10:48:36 -07:00
shamoon  264504affc  Fix consumer declaration file extensions  2025-08-10 05:32:52 -07:00
shamoon  4feedf2add  Merge branch 'dev' into feature-remote-ocr-2  2025-08-06 16:04:25 -04:00
shamoon  2f76cf9831  Merge branch 'dev' into feature-remote-ocr-2  2025-08-01 23:55:49 -04:00
shamoon  1002d37f6b  Update test_parser.py  2025-07-09 11:05:37 -07:00
shamoon  d260a94740  Update parsers.py  2025-07-09 11:02:57 -07:00
shamoon  88c69b83ea  Update index.md  2025-07-09 11:00:12 -07:00
shamoon  2557ee2014  Update docs to mention remote OCR with Azure AI  2025-07-09 09:53:30 -07:00
shamoon  3c75deed80  Add paperless_remote tests to testpaths  2025-07-08 14:19:45 -07:00
shamoon  d05343c927  Test fixes / coverage  2025-07-08 14:19:45 -07:00
shamoon  e7972b7eaf  Coverage  2025-07-08 14:19:45 -07:00
shamoon  75a091cc0d  Fix test  2025-07-08 14:19:44 -07:00
shamoon  dca74803fd  Use output_content_format poller.result to get clean content  2025-07-08 14:19:44 -07:00
shamoon  3cf3d868d0  Some docs  2025-07-08 14:19:43 -07:00
shamoon  bf4fc6604a  Test  2025-07-08 14:19:43 -07:00
shamoon  e8c1eb86fa  This actually works [ci skip]  2025-07-08 14:19:43 -07:00
shamoon  c3dad3cf69  Basic parse  2025-07-08 14:19:42 -07:00
shamoon  811bd66088  Ok, restart implementing this with just azure [ci skip]  2025-07-08 14:19:42 -07:00
16 changed files with 382 additions and 388 deletions

View File

@@ -1800,3 +1800,23 @@ password. All of these options come from their similarly-named [Django settings]
#### [`PAPERLESS_EMAIL_USE_SSL=<bool>`](#PAPERLESS_EMAIL_USE_SSL) {#PAPERLESS_EMAIL_USE_SSL}
: Defaults to false.
## Remote OCR
#### [`PAPERLESS_REMOTE_OCR_ENGINE=<str>`](#PAPERLESS_REMOTE_OCR_ENGINE) {#PAPERLESS_REMOTE_OCR_ENGINE}
: The remote OCR engine to use. Currently, Azure AI is the only supported engine; set this to "azureai".
Defaults to None, which disables remote OCR.
#### [`PAPERLESS_REMOTE_OCR_API_KEY=<str>`](#PAPERLESS_REMOTE_OCR_API_KEY) {#PAPERLESS_REMOTE_OCR_API_KEY}
: The API key to use for the remote OCR engine.
Defaults to None.
#### [`PAPERLESS_REMOTE_OCR_ENDPOINT=<str>`](#PAPERLESS_REMOTE_OCR_ENDPOINT) {#PAPERLESS_REMOTE_OCR_ENDPOINT}
: The endpoint to use for the remote OCR engine. This is required for Azure AI.
Defaults to None.
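
As a minimal sketch, these three options would typically be set together as environment variables; the values below are placeholders, not defaults:

PAPERLESS_REMOTE_OCR_ENGINE=azureai
PAPERLESS_REMOTE_OCR_API_KEY=<your-azure-api-key>
PAPERLESS_REMOTE_OCR_ENDPOINT=https://<your-resource>.cognitiveservices.azure.com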

View File

@@ -25,9 +25,10 @@ physical documents into a searchable online archive so you can keep, well, _less
## Features
- **Organize and index** your scanned documents with tags, correspondents, types, and more.
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way.
- _Your_ data is stored locally on _your_ server and is never transmitted or shared in any way, unless you explicitly choose to do so.
- Performs **OCR** on your documents, adding searchable and selectable text, even to documents scanned with only images.
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
- Utilizes the open-source Tesseract engine to recognize more than 100 languages.
- _New!_ Supports remote OCR with Azure AI (opt-in).
- Documents are saved as PDF/A format which is designed for long term storage, alongside the unaltered originals.
- Uses machine-learning to automatically add tags, correspondents and document types to your documents.
- Supports PDF documents, images, plain text files, Office documents (Word, Excel, PowerPoint, and LibreOffice equivalents)[^1] and more.

View File

@@ -850,6 +850,21 @@ how regularly you intend to scan documents and use paperless.
performed the task associated with the document, move it to the
inbox.
## Remote OCR
!!! important
This feature is disabled by default and will always remain strictly "opt-in".
Paperless-ngx supports performing OCR on documents using remote services. At the moment, this is limited to
[Microsoft's Azure "Document Intelligence" service](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence).
This is of course a paid service (with a free tier) which requires an Azure account and subscription. Azure AI is not affiliated with
Paperless-ngx in any way. When enabled, Paperless-ngx will automatically send appropriate documents to Azure for OCR processing, bypassing
the local OCR engine. See the [configuration](configuration.md#PAPERLESS_REMOTE_OCR_ENGINE) options for more details.
Additionally, when using a commercial service with this feature, consider both the potential costs and any associated file size
or page limitations (e.g. on a free tier).
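
As a minimal sketch of the request the new parser sends, mirroring the calls in paperless_remote.parsers shown further down (the endpoint, key, and file name here are placeholders):

from pathlib import Path
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.core.credentials import AzureKeyCredential

# Placeholders: substitute your own Document Intelligence resource and key.
client = DocumentIntelligenceClient(
    endpoint="https://<your-resource>.cognitiveservices.azure.com",
    credential=AzureKeyCredential("<api-key>"),
)
with Path("scan.pdf").open("rb") as f:
    poller = client.begin_analyze_document(
        model_id="prebuilt-read",
        body=AnalyzeDocumentRequest(bytes_source=f.read()),
        content_type="application/json",
    )
print(poller.result().content)  # plain text extracted by the service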
## Architecture
Paperless-ngx consists of the following components:

View File

@@ -15,6 +15,7 @@ classifiers = [
# This will allow testing to not install a webserver, mysql, etc
dependencies = [
"azure-ai-documentintelligence>=1.0.2",
"babel>=2.17",
"bleach~=6.2.0",
"celery[redis]~=5.5.1",
@@ -230,6 +231,7 @@ testpaths = [
"src/paperless_tesseract/tests/",
"src/paperless_tika/tests",
"src/paperless_text/tests/",
"src/paperless_remote/tests/",
]
addopts = [
"--pythonwarnings=all",

View File

@@ -322,6 +322,7 @@ INSTALLED_APPS = [
"paperless_tesseract.apps.PaperlessTesseractConfig",
"paperless_text.apps.PaperlessTextConfig",
"paperless_mail.apps.PaperlessMailConfig",
"paperless_remote.apps.PaperlessRemoteParserConfig",
"django.contrib.admin",
"rest_framework",
"rest_framework.authtoken",
@@ -425,7 +426,7 @@ WHITENOISE_STATIC_PREFIX = "/static/"
if machine().lower() == "aarch64": # pragma: no cover
_static_backend = "django.contrib.staticfiles.storage.StaticFilesStorage"
else:
_static_backend = "paperless.staticfiles.DeduplicatedCompressedStaticFilesStorage"
_static_backend = "whitenoise.storage.CompressedStaticFilesStorage"
STORAGES = {
"staticfiles": {
@@ -1388,3 +1389,10 @@ WEBHOOKS_ALLOW_INTERNAL_REQUESTS = __get_boolean(
"PAPERLESS_WEBHOOKS_ALLOW_INTERNAL_REQUESTS",
"true",
)
###############################################################################
# Remote Parser #
###############################################################################
REMOTE_OCR_ENGINE = os.getenv("PAPERLESS_REMOTE_OCR_ENGINE")
REMOTE_OCR_API_KEY = os.getenv("PAPERLESS_REMOTE_OCR_API_KEY")
REMOTE_OCR_ENDPOINT = os.getenv("PAPERLESS_REMOTE_OCR_ENDPOINT")

View File

@@ -1,385 +0,0 @@
import gzip
import hashlib
import logging
import os
import shutil
import threading
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed
from dataclasses import dataclass
from pathlib import Path
import brotli
import humanize
from django.contrib.staticfiles.storage import StaticFilesStorage
logger = logging.getLogger(__name__)
@dataclass(slots=True)
class FileInfo:
file_path_str: str
file_path_path: Path
checksum: str
original_size: int
gzip_size: int | None = None
brotli_size: int | None = None
class DeduplicatedCompressedStaticFilesStorage(StaticFilesStorage):
# File extensions that should be compressed
COMPRESSIBLE_EXTENSIONS = {
".css",
".js",
".html",
".htm",
".xml",
".json",
".txt",
".svg",
".md",
".rst",
".csv",
".tsv",
".yaml",
".yml",
".map",
}
# Minimum file size to compress (bytes)
MIN_COMPRESS_SIZE = 1024 # 1KB
# Maximum number of threads for parallel processing
MAX_WORKERS = min(32, (os.cpu_count() or 1) + 4)
# Chunk size for file reading
CHUNK_SIZE = 64 * 1024 # 64KB
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Map each collected path to its FileInfo for easy lookup
self.hash_to_files: dict[str, list[FileInfo]] = defaultdict(list)
self.path_to_file_info: dict[str, FileInfo] = {}
self.linked_files: set[Path] = set()
self.compression_stats = {
"brotli": 0,
"gzip": 0,
"skipped_linked": 0,
"skipped_other": 0,
"errors": 0,
}
self._lock = threading.Lock()
def post_process(self, paths: list[str], **options):
"""
Post-process collected files: deduplicate first, then compress.
Django 5.2 compatible with proper options handling.
"""
start_time = time.time()
# Step 1: Build hash map for deduplication (parallel)
self._build_file_hash_map_parallel(paths)
# Step 2: Create hard links for duplicate files
self._create_hard_links()
# Step 3: Compress files (parallel, skip linked duplicates)
self._compress_files_parallel(paths)
# Step 4: Provide user a summary of the compression
self._log_compression_summary()
processing_time = time.time() - start_time
logger.info(f"Post-processing complete in {processing_time:.2f}s.")
# Return list of processed files
processed_files = []
for path in paths:
processed_files.append((path, path, True))
# Add compressed variants
file_path = self.path(path)
if Path(file_path + ".br").exists():
processed_files.append((path + ".br", path + ".br", True))
if Path(file_path + ".gz").exists():
processed_files.append((path + ".gz", path + ".gz", True))
return processed_files
def _build_file_hash_map_parallel(self, file_paths: list[str]):
"""Build a map of file hashes using parallel processing."""
logger.info(
f"Hashing {len(file_paths)} files with {self.MAX_WORKERS} workers...",
)
def hash_file(path: str):
"""Hash a single file."""
try:
file_path = Path(self.path(path))
if not file_path.is_file():
return None, None, None
file_hash = self._get_file_hash_fast(file_path)
file_size = file_path.stat().st_size
return path, file_hash, file_size
except Exception as e:
logger.warning(f"Error hashing file {path}: {e}")
return path, None, None
with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
future_to_path = {
executor.submit(hash_file, path): path for path in file_paths
}
for future in as_completed(future_to_path):
path, file_hash, file_size = future.result()
if path is not None and file_hash is not None and file_size is not None:
with self._lock:
file_info = FileInfo(
file_path_str=path,
file_path_path=Path(self.path(path)),
checksum=file_hash,
original_size=file_size,
)
self.hash_to_files[file_hash].append(file_info)
self.path_to_file_info[path] = file_info
duplicates = sum(1 for files in self.hash_to_files.values() if len(files) > 1)
logger.info(f"Found {duplicates} sets of duplicate files")
def _get_file_hash_fast(self, file_path: Path):
"""Calculate SHA-256 hash of file content with optimized reading."""
hash_sha256 = hashlib.sha256()
try:
with file_path.open("rb") as f:
while chunk := f.read(self.CHUNK_SIZE):
hash_sha256.update(chunk)
except OSError as e:
logger.warning(f"Could not read file {file_path}: {e}")
raise
return hash_sha256.hexdigest()
def _create_hard_links(self):
"""Create hard links for duplicate files."""
logger.info("Creating hard links for duplicate files...")
linked_count = 0
for file_info_list in self.hash_to_files.values():
if len(file_info_list) <= 1:
continue
# Sort by file size (desc) then path length (asc) to keep best original
file_info_list.sort(key=lambda x: (-x.original_size, len(x.file_path_str)))
original_file_info = file_info_list[0]
duplicate_info = file_info_list[1:]
for duplicate_file_info in duplicate_info:
try:
# Remove duplicate file and create hard link
if duplicate_file_info.file_path_path.exists():
duplicate_file_info.file_path_path.unlink()
# Create hard link
os.link(
original_file_info.file_path_path,
duplicate_file_info.file_path_path,
)
with self._lock:
self.linked_files.add(duplicate_file_info.file_path_path)
linked_count += 1
logger.info(
f"Linked {duplicate_file_info.file_path_path} -> {original_file_info.file_path_path}",
)
except OSError as e:
logger.error(
f"Hard link failed for {original_file_info.file_path_path}, copying instead: {e}",
)
# Fall back to copying if hard linking fails
try:
shutil.copy2(
original_file_info.file_path_path,
duplicate_file_info.file_path_path,
)
logger.error(
f"Copied {duplicate_file_info.file_path_path} (hard link failed)",
)
except Exception as copy_error:
logger.error(
f"Failed to copy {duplicate_file_info.file_path_path}: {copy_error}",
)
if linked_count > 0:
logger.info(f"Created {linked_count} hard links")
def _compress_files_parallel(self, file_paths: list[str]):
"""Compress files using parallel processing and update FileInfo objects."""
# Identify files to compress, excluding hard links
compressible_files = [
self.path_to_file_info[path]
for path in file_paths
if self.path_to_file_info[path].file_path_path not in self.linked_files
and self._should_compress_file(path)
]
if not compressible_files:
logger.info("No new files to compress")
return
logger.info(
f"Compressing {len(compressible_files)} files with {self.MAX_WORKERS} workers...",
)
def compress_file(file_info: FileInfo):
"""Compress a single file and update its FileInfo by side-effect."""
brotli_size = None
gzip_size = None
error = None
try:
brotli_size = self._compress_file_brotli(str(file_info.file_path_path))
gzip_size = self._compress_file_gzip(str(file_info.file_path_path))
# Store the compressed sizes
file_info.brotli_size = brotli_size
file_info.gzip_size = gzip_size
except Exception as e:
error = str(e)
logger.warning(f"Error compressing {file_info.file_path_str}: {e}")
return {
"brotli": brotli_size is not None,
"gzip": gzip_size is not None,
"error": error,
}
with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
future_to_info = {
executor.submit(compress_file, info): info
for info in compressible_files
}
for future in as_completed(future_to_info):
result = future.result()
with self._lock:
if result["brotli"]:
self.compression_stats["brotli"] += 1
if result["gzip"]:
self.compression_stats["gzip"] += 1
if result["error"]:
self.compression_stats["errors"] += 1
if (
not result["brotli"]
and not result["gzip"]
and not result["error"]
):
self.compression_stats["skipped_other"] += 1
self.compression_stats["skipped_linked"] = len(self.linked_files)
logger.info(f"File count stats: {self.compression_stats}")
def _should_compress_file(self, path: str):
"""Determine if a file should be compressed."""
file_ext = Path(path).suffix.lower()
if file_ext not in self.COMPRESSIBLE_EXTENSIONS:
return False
try:
if Path(self.path(path)).stat().st_size < self.MIN_COMPRESS_SIZE:
return False
except OSError:
return False
return True
def _compress_file_brotli(self, file_path: str) -> int | None:
"""Compress file using Brotli, returns compressed size or None."""
brotli_path = Path(file_path + ".br")
try:
with Path(file_path).open("rb") as f_in:
original_data = f_in.read()
compressed_data = brotli.compress(
original_data,
quality=10,
lgwin=22, # Window size
lgblock=0, # Auto block size
)
if len(compressed_data) < len(original_data) * 0.95:
with brotli_path.open("wb") as f_out:
f_out.write(compressed_data)
return len(compressed_data)
return None
except Exception as e:
logger.warning(f"Brotli compression failed for {file_path}: {e}")
return None
def _compress_file_gzip(self, file_path: str) -> int | None:
"""Compress file using GZip, returns compressed size or None."""
gzip_path = Path(file_path + ".gz")
file_path_path = Path(file_path)
try:
original_size = file_path_path.stat().st_size
with (
file_path_path.open("rb") as f_in,
gzip.open(
gzip_path,
"wb",
compresslevel=7,
) as f_out,
):
shutil.copyfileobj(f_in, f_out, length=self.CHUNK_SIZE)
compressed_size = gzip_path.stat().st_size
if compressed_size < original_size * 0.95:
return compressed_size
else:
gzip_path.unlink()
return None
except Exception as e:
logger.warning(f"GZip compression failed for {file_path}: {e}")
if gzip_path.exists():
try:
gzip_path.unlink()
except OSError:
pass
return None
def _log_compression_summary(self):
"""Calculates and logs the total size savings from compression."""
total_original_size = 0
total_brotli_size = 0
total_gzip_size = 0
# Only consider the original files, not the duplicates, for size calculation
unique_files = {
file_list[0].checksum: file_list[0]
for file_list in self.hash_to_files.values()
}
for file_info in unique_files.values():
if self._should_compress_file(file_info.file_path_str):
total_original_size += file_info.original_size
if file_info.brotli_size:
total_brotli_size += file_info.brotli_size
if file_info.gzip_size:
total_gzip_size += file_info.gzip_size
def get_savings(original: int, compressed: int) -> str:
if original == 0:
return "0.00%"
return f"{(1 - compressed / original) * 100:.2f}%"
logger.info(
f"Total Original Size (compressible files): {humanize.naturalsize(total_original_size)}",
)
if total_brotli_size > 0:
logger.info(
f"Total Brotli Size: {humanize.naturalsize(total_brotli_size)} "
f"(Savings: {get_savings(total_original_size, total_brotli_size)})",
)
if total_gzip_size > 0:
logger.info(
f"Total Gzip Size: {humanize.naturalsize(total_gzip_size)} "
f"(Savings: {get_savings(total_original_size, total_gzip_size)})",
)

View File

@@ -0,0 +1,4 @@
# This import is here so that Django finds the checks.
from paperless_remote.checks import check_remote_parser_configured
__all__ = ["check_remote_parser_configured"]

View File

@@ -0,0 +1,14 @@
from django.apps import AppConfig
from paperless_remote.signals import remote_consumer_declaration
class PaperlessRemoteParserConfig(AppConfig):
name = "paperless_remote"
def ready(self):
from documents.signals import document_consumer_declaration
document_consumer_declaration.connect(remote_consumer_declaration)
super().ready()

View File

@@ -0,0 +1,15 @@
from django.conf import settings
from django.core.checks import Error
from django.core.checks import register
@register()
def check_remote_parser_configured(app_configs, **kwargs):
if settings.REMOTE_OCR_ENGINE == "azureai" and not settings.REMOTE_OCR_ENDPOINT:
return [
Error(
"Azure AI remote parser requires endpoint to be configured.",
),
]
return []

View File

@@ -0,0 +1,113 @@
from pathlib import Path
from django.conf import settings
from paperless_tesseract.parsers import RasterisedDocumentParser
class RemoteEngineConfig:
def __init__(
self,
engine: str,
api_key: str | None = None,
endpoint: str | None = None,
):
self.engine = engine
self.api_key = api_key
self.endpoint = endpoint
def engine_is_valid(self):
valid = self.engine in ["azureai"] and self.api_key is not None
if self.engine == "azureai":
valid = valid and self.endpoint is not None
return valid
class RemoteDocumentParser(RasterisedDocumentParser):
"""
This parser uses a remote OCR engine to parse documents. Currently, it supports Azure Document Intelligence,
as this is the only service that provides a remote OCR API with text-embedded PDF output.
"""
logging_name = "paperless.parsing.remote"
def get_settings(self) -> RemoteEngineConfig:
"""
Returns the configuration for the remote OCR engine, loaded from Django settings.
"""
return RemoteEngineConfig(
engine=settings.REMOTE_OCR_ENGINE,
api_key=settings.REMOTE_OCR_API_KEY,
endpoint=settings.REMOTE_OCR_ENDPOINT,
)
def supported_mime_types(self):
if self.settings.engine_is_valid():
return {
"application/pdf": ".pdf",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/tiff": ".tiff",
"image/bmp": ".bmp",
"image/gif": ".gif",
"image/webp": ".webp",
}
else:
return {}
def azure_ai_vision_parse(
self,
file: Path,
) -> str | None:
"""
Uses Azure Document Intelligence to parse the document and return the text content.
It requests a searchable PDF output with embedded text.
The PDF is saved to the archive_path attribute.
Returns the text content extracted from the document.
If the parsing fails, it returns None.
"""
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from azure.ai.documentintelligence.models import AnalyzeOutputOption
from azure.ai.documentintelligence.models import DocumentContentFormat
from azure.core.credentials import AzureKeyCredential
client = DocumentIntelligenceClient(
endpoint=self.settings.endpoint,
credential=AzureKeyCredential(self.settings.api_key),
)
with file.open("rb") as f:
analyze_request = AnalyzeDocumentRequest(bytes_source=f.read())
poller = client.begin_analyze_document(
model_id="prebuilt-read",
body=analyze_request,
output_content_format=DocumentContentFormat.TEXT,
output=[AnalyzeOutputOption.PDF], # request searchable PDF output
content_type="application/json",
)
poller.wait()
result_id = poller.details["operation_id"]
result = poller.result()
# Download the PDF with embedded text
self.archive_path = Path(self.tempdir) / "archive.pdf"
with self.archive_path.open("wb") as f:
for chunk in client.get_analyze_result_pdf(
model_id="prebuilt-read",
result_id=result_id,
):
f.write(chunk)
return result.content
def parse(self, document_path: Path, mime_type, file_name=None):
if not self.settings.engine_is_valid():
self.log.warning(
"No valid remote parser engine is configured, content will be empty.",
)
self.text = ""
return
elif self.settings.engine == "azureai":
self.text = self.azure_ai_vision_parse(document_path)

View File

@@ -0,0 +1,18 @@
def get_parser(*args, **kwargs):
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(*args, **kwargs)
def get_supported_mime_types():
from paperless_remote.parsers import RemoteDocumentParser
return RemoteDocumentParser(None).supported_mime_types()
def remote_consumer_declaration(sender, **kwargs):
return {
"parser": get_parser,
"weight": 5,
"mime_types": get_supported_mime_types(),
}

View File

Binary file not shown.

View File

@@ -0,0 +1,29 @@
from django.test import TestCase
from django.test import override_settings
from paperless_remote import check_remote_parser_configured
class TestChecks(TestCase):
@override_settings(REMOTE_OCR_ENGINE=None)
def test_no_engine(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 0)
@override_settings(REMOTE_OCR_ENGINE="azureai")
@override_settings(REMOTE_OCR_API_KEY="somekey")
@override_settings(REMOTE_OCR_ENDPOINT=None)
def test_azure_no_endpoint(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 1)
self.assertTrue(
msgs[0].msg.startswith(
"Azure AI remote parser requires endpoint to be configured.",
),
)
@override_settings(REMOTE_OCR_ENGINE="something")
@override_settings(REMOTE_OCR_API_KEY="somekey")
def test_valid_configuration(self):
msgs = check_remote_parser_configured(None)
self.assertEqual(len(msgs), 0)

View File

@@ -0,0 +1,101 @@
import uuid
from pathlib import Path
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
from paperless_remote.parsers import RemoteDocumentParser
from paperless_remote.signals import get_parser
class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = Path(__file__).resolve().parent / "samples"
def assertContainsStrings(self, content, strings):
# Asserts that all strings appear in content, in the given order.
indices = []
for s in strings:
if s in content:
indices.append(content.index(s))
else:
self.fail(f"'{s}' is not in '{content}'")
self.assertListEqual(indices, sorted(indices))
@mock.patch("paperless_tesseract.parsers.run_subprocess")
@mock.patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
def test_get_text_with_azure(self, mock_client_cls, mock_subprocess):
# Arrange mock Azure client
mock_client = mock.Mock()
mock_client_cls.return_value = mock_client
# Simulate poller result and its `.details`
mock_poller = mock.Mock()
mock_poller.wait.return_value = None
mock_poller.details = {"operation_id": "fake-op-id"}
mock_client.begin_analyze_document.return_value = mock_poller
mock_poller.result.return_value.content = "This is a test document."
# Return dummy PDF bytes
mock_client.get_analyze_result_pdf.return_value = [
b"%PDF-",
b"1.7 ",
b"FAKEPDF",
]
# Simulate pdftotext by writing dummy text to sidecar file
def fake_run(cmd, *args, **kwargs):
with Path(cmd[-1]).open("w", encoding="utf-8") as f:
f.write("This is a test document.")
mock_subprocess.side_effect = fake_run
with override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="somekey",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
):
parser = get_parser(uuid.uuid4())
parser.parse(
self.SAMPLE_FILES / "simple-digital.pdf",
"application/pdf",
)
self.assertContainsStrings(
parser.text.strip(),
["This is a test document."],
)
@override_settings(
REMOTE_OCR_ENGINE="azureai",
REMOTE_OCR_API_KEY="key",
REMOTE_OCR_ENDPOINT="https://endpoint.cognitiveservices.azure.com",
)
def test_supported_mime_types_valid_config(self):
parser = RemoteDocumentParser(uuid.uuid4())
expected_types = {
"application/pdf": ".pdf",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/tiff": ".tiff",
"image/bmp": ".bmp",
"image/gif": ".gif",
"image/webp": ".webp",
}
self.assertEqual(parser.supported_mime_types(), expected_types)
def test_supported_mime_types_invalid_config(self):
parser = get_parser(uuid.uuid4())
self.assertEqual(parser.supported_mime_types(), {})
@override_settings(
REMOTE_OCR_ENGINE=None,
REMOTE_OCR_API_KEY=None,
REMOTE_OCR_ENDPOINT=None,
)
def test_parse_with_invalid_config(self):
parser = get_parser(uuid.uuid4())
parser.parse(self.SAMPLE_FILES / "simple-digital.pdf", "application/pdf")
self.assertEqual(parser.text, "")

uv.lock (generated)
View File

@@ -95,6 +95,34 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/af/cc/55a32a2c98022d88812b5986d2a92c4ff3ee087e83b712ebc703bba452bf/Automat-24.8.1-py3-none-any.whl", hash = "sha256:bf029a7bc3da1e2c24da2343e7598affaa9f10bf0ab63ff808566ce90551e02a", size = 42585, upload-time = "2024-08-19T17:31:56.729Z" },
]
[[package]]
name = "azure-ai-documentintelligence"
version = "1.0.2"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "azure-core", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "isodate", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/44/7b/8115cd713e2caa5e44def85f2b7ebd02a74ae74d7113ba20bdd41fd6dd80/azure_ai_documentintelligence-1.0.2.tar.gz", hash = "sha256:4d75a2513f2839365ebabc0e0e1772f5601b3a8c9a71e75da12440da13b63484", size = 170940 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d9/75/c9ec040f23082f54ffb1977ff8f364c2d21c79a640a13d1c1809e7fd6b1a/azure_ai_documentintelligence-1.0.2-py3-none-any.whl", hash = "sha256:e1fb446abbdeccc9759d897898a0fe13141ed29f9ad11fc705f951925822ed59", size = 106005 },
]
[[package]]
name = "azure-core"
version = "1.33.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "requests", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "six", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "typing-extensions", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/75/aa/7c9db8edd626f1a7d99d09ef7926f6f4fb34d5f9fa00dc394afdfe8e2a80/azure_core-1.33.0.tar.gz", hash = "sha256:f367aa07b5e3005fec2c1e184b882b0b039910733907d001c20fb08ebb8c0eb9", size = 295633 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/07/b7/76b7e144aa53bd206bf1ce34fa75350472c3f69bf30e5c8c18bc9881035d/azure_core-1.33.0-py3-none-any.whl", hash = "sha256:9b5b6d0223a1d38c37500e6971118c1e0f13f54951e6893968b38910bc9cda8f", size = 207071 },
]
[[package]]
name = "babel"
version = "2.17.0"
@@ -1402,6 +1430,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/c7/fc/4e5a141c3f7c7bed550ac1f69e599e92b6be449dd4677ec09f325cad0955/inotifyrecursive-0.3.5-py3-none-any.whl", hash = "sha256:7e5f4a2e1dc2bef0efa3b5f6b339c41fb4599055a2b54909d020e9e932cc8d2f", size = 8009, upload-time = "2020-11-20T12:38:46.981Z" },
]
[[package]]
name = "isodate"
version = "0.7.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320 },
]
[[package]]
name = "jinja2"
version = "3.1.6"
@@ -2010,6 +2047,7 @@ name = "paperless-ngx"
version = "2.18.3"
source = { virtual = "." }
dependencies = [
{ name = "azure-ai-documentintelligence", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "babel", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "bleach", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "celery", extra = ["redis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2144,6 +2182,7 @@ typing = [
[package.metadata]
requires-dist = [
{ name = "azure-ai-documentintelligence", specifier = ">=1.0.2" },
{ name = "babel", specifier = ">=2.17" },
{ name = "bleach", specifier = "~=6.2.0" },
{ name = "celery", extras = ["redis"], specifier = "~=5.5.1" },