Merge remote-tracking branch 'paperless-ngx/dev' into dev

This commit is contained in:
Trenton Holmes
2023-08-03 10:00:14 -07:00
93 changed files with 4444 additions and 5187 deletions

View File

@@ -1,15 +1,12 @@
import logging
import shutil
import tempfile
from dataclasses import dataclass
from pathlib import Path
from subprocess import run
from typing import Dict
from typing import Final
from typing import List
from typing import Optional
import img2pdf
from django.conf import settings
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
@@ -17,7 +14,10 @@ from pikepdf import Page
from pikepdf import Pdf
from PIL import Image
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import DocumentSource
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
logger = logging.getLogger("paperless.barcodes")
@@ -54,7 +54,7 @@ class BarcodeReader:
self.mime: Final[str] = mime_type
self.pdf_file: Path = self.file
self.barcodes: List[Barcode] = []
self.temp_dir: Optional[Path] = None
self.temp_dir: Optional[tempfile.TemporaryDirectory] = None
if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"}
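The annotation fix above reflects what the attribute really holds: a tempfile.TemporaryDirectory object, whose .name attribute is the path, rather than a Path itself. A minimal standalone sketch of that distinction:

import tempfile
from pathlib import Path

# TemporaryDirectory is an object with cleanup(); .name holds the path string.
temp_dir = tempfile.TemporaryDirectory()
scratch = Path(temp_dir.name) / "scratch.pdf"  # build Paths from .name
scratch.touch()
temp_dir.cleanup()  # removes the directory and its contents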
@@ -154,34 +154,7 @@ class BarcodeReader:
if self.mime != "image/tiff":
return
with Image.open(self.file) as im:
has_alpha_layer = im.mode in ("RGBA", "LA")
if has_alpha_layer:
# Note the save into the temp folder, so as not to trigger a new
# consume
scratch_image = Path(self.temp_dir.name) / Path(self.file.name)
run(
[
settings.CONVERT_BINARY,
"-alpha",
"off",
self.file,
scratch_image,
],
)
else:
# Not modifying the original, safe to use in place
scratch_image = self.file
self.pdf_file = Path(self.temp_dir.name) / Path(self.file.name).with_suffix(
".pdf",
)
with scratch_image.open("rb") as img_file, self.pdf_file.open("wb") as pdf_file:
pdf_file.write(img2pdf.convert(img_file))
# Copy what file stat is possible
shutil.copystat(self.file, self.pdf_file)
self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name))
def detect(self) -> None:
"""
@@ -306,7 +279,7 @@ class BarcodeReader:
with open(savepath, "wb") as out:
dst.save(out)
shutil.copystat(self.file, savepath)
copy_basic_file_stats(self.file, savepath)
document_paths.append(savepath)
@@ -363,5 +336,5 @@ class BarcodeReader:
else:
dest = save_to_dir
logger.info(f"Saving {document_path} to {dest}")
shutil.copy2(document_path, dest)
copy_file_with_basic_stats(document_path, dest)
return True
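The substitution above recurs throughout this commit: shutil.copy2 and shutil.copystat are replaced with copy_file_with_basic_stats and copy_basic_file_stats from documents.utils. The utils module itself is not part of this diff, so the following is only a plausible minimal sketch of the assumed semantics: copy the contents, then carry over just the basic timestamps, since a full copystat can fail on some filesystems.

import shutil
from os import utime
from pathlib import Path

# Hypothetical sketch; the real documents.utils implementation may differ.
def copy_basic_file_stats(source: Path, dest: Path) -> None:
    # Only atime/mtime, skipping the permission bits and xattrs that
    # shutil.copystat would also attempt.
    stats = source.stat()
    utime(dest, ns=(stats.st_atime_ns, stats.st_mtime_ns))

def copy_file_with_basic_stats(source: Path, dest: Path) -> None:
    shutil.copy(source, dest)  # contents only, unlike shutil.copy2
    copy_basic_file_stats(source, dest)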

View File

@@ -5,6 +5,7 @@ import re
import warnings
from datetime import datetime
from hashlib import sha256
from pathlib import Path
from typing import Iterator
from typing import List
from typing import Optional
@@ -81,7 +82,7 @@ class DocumentClassifier:
self._stemmer = None
self._stop_words = None
def load(self):
def load(self) -> None:
# Catch warnings for processing
with warnings.catch_warnings(record=True) as w:
with open(settings.MODEL_FILE, "rb") as f:
@@ -120,19 +121,20 @@ class DocumentClassifier:
raise IncompatibleClassifierVersionError
def save(self):
target_file = settings.MODEL_FILE
target_file_temp = settings.MODEL_FILE.with_suffix(".pickle.part")
target_file: Path = settings.MODEL_FILE
target_file_temp = target_file.with_suffix(".pickle.part")
with open(target_file_temp, "wb") as f:
pickle.dump(self.FORMAT_VERSION, f)
pickle.dump(self.last_doc_change_time, f)
pickle.dump(self.last_auto_type_hash, f)
pickle.dump(self.data_vectorizer, f)
pickle.dump(self.tags_binarizer, f)
pickle.dump(self.tags_classifier, f)
pickle.dump(self.correspondent_classifier, f)
pickle.dump(self.document_type_classifier, f)
pickle.dump(self.storage_path_classifier, f)
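save() emits a fixed sequence of pickles, so load() has to call pickle.load the same number of times in exactly the same order, starting with FORMAT_VERSION so an incompatible model file can be rejected before anything else is unpickled. A sketch of the matching read side, assuming the field order above:

import pickle

# Sketch only: mirrors the dump order in save(); the real load() assigns
# these to attributes and raises IncompatibleClassifierVersionError.
def load_sketch(path, expected_version: int) -> dict:
    with open(path, "rb") as f:
        if pickle.load(f) != expected_version:
            raise ValueError("incompatible classifier format version")
        fields = [
            "last_doc_change_time",
            "last_auto_type_hash",
            "data_vectorizer",
            "tags_binarizer",
            "tags_classifier",
            "correspondent_classifier",
            "document_type_classifier",
            "storage_path_classifier",
        ]
        return {name: pickle.load(f) for name in fields}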
@@ -247,7 +249,7 @@ class DocumentClassifier:
data_vectorized = self.data_vectorizer.fit_transform(content_generator())
# See the notes here:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html # noqa: 501
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html # noqa: E501
# This attribute isn't needed to function and can be large
self.data_vectorizer.stop_words_ = None
@@ -380,7 +382,7 @@ class DocumentClassifier:
return content
def predict_correspondent(self, content: str):
def predict_correspondent(self, content: str) -> Optional[int]:
if self.correspondent_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
correspondent_id = self.correspondent_classifier.predict(X)
@@ -391,7 +393,7 @@ class DocumentClassifier:
else:
return None
def predict_document_type(self, content: str):
def predict_document_type(self, content: str) -> Optional[int]:
if self.document_type_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
document_type_id = self.document_type_classifier.predict(X)
@@ -402,7 +404,7 @@ class DocumentClassifier:
else:
return None
def predict_tags(self, content: str):
def predict_tags(self, content: str) -> List[int]:
from sklearn.utils.multiclass import type_of_target
if self.tags_classifier:
@@ -423,7 +425,7 @@ class DocumentClassifier:
else:
return []
def predict_storage_path(self, content: str):
def predict_storage_path(self, content: str) -> Optional[int]:
if self.storage_path_classifier:
X = self.data_vectorizer.transform([self.preprocess_content(content)])
storage_path_id = self.storage_path_classifier.predict(X)

View File

@@ -1,9 +1,9 @@
import datetime
import hashlib
import os
import shutil
import tempfile
import uuid
from enum import Enum
from pathlib import Path
from subprocess import CompletedProcess
from subprocess import run
@@ -21,6 +21,9 @@ from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats
from .classifier import load_classifier
from .file_handling import create_source_path_directory
from .file_handling import generate_unique_filename
@@ -42,21 +45,30 @@ class ConsumerError(Exception):
pass
MESSAGE_DOCUMENT_ALREADY_EXISTS = "document_already_exists"
MESSAGE_ASN_ALREADY_EXISTS = "asn_already_exists"
MESSAGE_ASN_RANGE = "asn_value_out_of_range"
MESSAGE_FILE_NOT_FOUND = "file_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error"
MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found"
MESSAGE_POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error"
MESSAGE_NEW_FILE = "new_file"
MESSAGE_UNSUPPORTED_TYPE = "unsupported_type"
MESSAGE_PARSING_DOCUMENT = "parsing_document"
MESSAGE_GENERATING_THUMBNAIL = "generating_thumbnail"
MESSAGE_PARSE_DATE = "parse_date"
MESSAGE_SAVE_DOCUMENT = "save_document"
MESSAGE_FINISHED = "finished"
class ConsumerStatusShortMessage(str, Enum):
DOCUMENT_ALREADY_EXISTS = "document_already_exists"
ASN_ALREADY_EXISTS = "asn_already_exists"
ASN_RANGE = "asn_value_out_of_range"
FILE_NOT_FOUND = "file_not_found"
PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found"
PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error"
POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found"
POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error"
NEW_FILE = "new_file"
UNSUPPORTED_TYPE = "unsupported_type"
PARSING_DOCUMENT = "parsing_document"
GENERATING_THUMBNAIL = "generating_thumbnail"
PARSE_DATE = "parse_date"
SAVE_DOCUMENT = "save_document"
FINISHED = "finished"
FAILED = "failed"
class ConsumerFilePhase(str, Enum):
STARTED = "STARTED"
WORKING = "WORKING"
SUCCESS = "SUCCESS"
FAILED = "FAILED"
class Consumer(LoggingMixin):
@@ -64,10 +76,10 @@ class Consumer(LoggingMixin):
def _send_progress(
self,
current_progress,
max_progress,
status,
message=None,
current_progress: int,
max_progress: int,
status: ConsumerFilePhase,
message: Optional[ConsumerStatusShortMessage] = None,
document_id=None,
): # pragma: no cover
payload = {
@@ -86,12 +98,12 @@ class Consumer(LoggingMixin):
def _fail(
self,
message,
log_message=None,
message: ConsumerStatusShortMessage,
log_message: Optional[str] = None,
exc_info=None,
exception: Optional[Exception] = None,
):
self._send_progress(100, 100, "FAILED", message)
self._send_progress(100, 100, ConsumerFilePhase.FAILED, message)
self.log.error(log_message or message, exc_info=exc_info)
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
@@ -111,13 +123,19 @@ class Consumer(LoggingMixin):
self.channel_layer = get_channel_layer()
def pre_check_file_exists(self):
"""
Confirm the input file still exists where it should
"""
if not os.path.isfile(self.path):
self._fail(
MESSAGE_FILE_NOT_FOUND,
ConsumerStatusShortMessage.FILE_NOT_FOUND,
f"Cannot consume {self.path}: File not found.",
)
def pre_check_duplicate(self):
"""
Using the MD5 of the file, check this exact file doesn't already exist
"""
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
existing_doc = Document.objects.filter(
@@ -127,12 +145,15 @@ class Consumer(LoggingMixin):
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
self._fail(
MESSAGE_DOCUMENT_ALREADY_EXISTS,
ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS,
f"Not consuming {self.filename}: It is a duplicate of"
f" {existing_doc.get().title} (#{existing_doc.get().pk})",
)
def pre_check_directories(self):
"""
Ensure all required directories exist before attempting to use them
"""
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
@@ -152,7 +173,7 @@ class Consumer(LoggingMixin):
or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
):
self._fail(
MESSAGE_ASN_RANGE,
ConsumerStatusShortMessage.ASN_RANGE,
f"Not consuming {self.filename}: "
f"Given ASN {self.override_asn} is out of range "
f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
@@ -160,17 +181,21 @@ class Consumer(LoggingMixin):
)
if Document.objects.filter(archive_serial_number=self.override_asn).exists():
self._fail(
MESSAGE_ASN_ALREADY_EXISTS,
ConsumerStatusShortMessage.ASN_ALREADY_EXISTS,
f"Not consuming {self.filename}: Given ASN already exists!",
)
def run_pre_consume_script(self):
"""
If one is configured and exists, run the pre-consume script and
handle its output and/or errors
"""
if not settings.PRE_CONSUME_SCRIPT:
return
if not os.path.isfile(settings.PRE_CONSUME_SCRIPT):
self._fail(
MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND,
ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_NOT_FOUND,
f"Configured pre-consume script "
f"{settings.PRE_CONSUME_SCRIPT} does not exist.",
)
@@ -201,19 +226,23 @@ class Consumer(LoggingMixin):
except Exception as e:
self._fail(
MESSAGE_PRE_CONSUME_SCRIPT_ERROR,
ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_ERROR,
f"Error while executing pre-consume script: {e}",
exc_info=True,
exception=e,
)
def run_post_consume_script(self, document: Document):
"""
If one is configured and exists, run the post-consume script and
handle its output and/or errors
"""
if not settings.POST_CONSUME_SCRIPT:
return
if not os.path.isfile(settings.POST_CONSUME_SCRIPT):
self._fail(
MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND,
ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_NOT_FOUND,
f"Configured post-consume script "
f"{settings.POST_CONSUME_SCRIPT} does not exist.",
)
@@ -274,7 +303,7 @@ class Consumer(LoggingMixin):
except Exception as e:
self._fail(
MESSAGE_POST_CONSUME_SCRIPT_ERROR,
ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_ERROR,
f"Error while executing post-consume script: {e}",
exc_info=True,
exception=e,
@@ -308,7 +337,12 @@ class Consumer(LoggingMixin):
self.override_asn = override_asn
self.override_owner_id = override_owner_id
self._send_progress(0, 100, "STARTING", MESSAGE_NEW_FILE)
self._send_progress(
0,
100,
ConsumerFilePhase.STARTED,
ConsumerStatusShortMessage.NEW_FILE,
)
# Make sure that preconditions for consuming the file are met.
@@ -326,7 +360,7 @@ class Consumer(LoggingMixin):
dir=settings.SCRATCH_DIR,
)
self.path = Path(tempdir.name) / Path(self.filename)
shutil.copy2(self.original_path, self.path)
copy_file_with_basic_stats(self.original_path, self.path)
# Determine the parser class.
@@ -340,7 +374,10 @@ class Consumer(LoggingMixin):
)
if not parser_class:
tempdir.cleanup()
self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")
self._fail(
ConsumerStatusShortMessage.UNSUPPORTED_TYPE,
f"Unsupported mime type {mime_type}",
)
# Notify all listeners that we're going to do some work.
@@ -355,7 +392,7 @@ class Consumer(LoggingMixin):
def progress_callback(current_progress, max_progress): # pragma: no cover
# recalculate progress to be within 20 and 80
p = int((current_progress / max_progress) * 50 + 20)
self._send_progress(p, 100, "WORKING")
self._send_progress(p, 100, ConsumerFilePhase.WORKING)
# This doesn't parse the document yet, but gives us a parser.
@@ -377,12 +414,22 @@ class Consumer(LoggingMixin):
archive_path = None
try:
self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT)
self._send_progress(
20,
100,
ConsumerFilePhase.WORKING,
ConsumerStatusShortMessage.PARSING_DOCUMENT,
)
self.log.debug(f"Parsing {self.filename}...")
document_parser.parse(self.path, mime_type, self.filename)
self.log.debug(f"Generating thumbnail for {self.filename}...")
self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)
self._send_progress(
70,
100,
ConsumerFilePhase.WORKING,
ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
)
thumbnail = document_parser.get_thumbnail(
self.path,
mime_type,
@@ -392,7 +439,12 @@ class Consumer(LoggingMixin):
text = document_parser.get_text()
date = document_parser.get_date()
if date is None:
self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE)
self._send_progress(
90,
100,
ConsumerFilePhase.WORKING,
ConsumerStatusShortMessage.PARSE_DATE,
)
date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path()
@@ -414,7 +466,12 @@ class Consumer(LoggingMixin):
classifier = load_classifier()
self._send_progress(95, 100, "WORKING", MESSAGE_SAVE_DOCUMENT)
self._send_progress(
95,
100,
ConsumerFilePhase.WORKING,
ConsumerStatusShortMessage.SAVE_DOCUMENT,
)
# now that everything is done, we can start to store the document
# in the system. This will be a transaction and reasonably fast.
try:
@@ -499,7 +556,13 @@ class Consumer(LoggingMixin):
self.log.info(f"Document {document} consumption finished")
self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id)
self._send_progress(
100,
100,
ConsumerFilePhase.SUCCESS,
ConsumerStatusShortMessage.FINISHED,
document.id,
)
# Return the most up to date fields
document.refresh_from_db()
@@ -585,7 +648,7 @@ class Consumer(LoggingMixin):
# Attempt to copy file's original stats, but it's ok if we can't
try:
shutil.copystat(source, target)
copy_basic_file_stats(source, target)
except Exception: # pragma: no cover
pass

View File

@@ -0,0 +1,46 @@
from pathlib import Path
from subprocess import run
import img2pdf
from django.conf import settings
from PIL import Image
from documents.utils import copy_basic_file_stats
def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
"""
Converts a TIFF file into a PDF file.
The PDF will be created in the given target_directory and share the name of
the original TIFF file, as well as its stats (mtime etc.).
Returns the path of the PDF created.
"""
with Image.open(tiff_path) as im:
has_alpha_layer = im.mode in ("RGBA", "LA")
if has_alpha_layer:
# Note the save into the temp folder, so as not to trigger a new
# consume
scratch_image = target_directory / tiff_path.name
run(
[
settings.CONVERT_BINARY,
"-alpha",
"off",
tiff_path,
scratch_image,
],
)
else:
# Not modifying the original, safe to use in place
scratch_image = tiff_path
pdf_path = (target_directory / tiff_path.name).with_suffix(".pdf")
with scratch_image.open("rb") as img_file, pdf_path.open("wb") as pdf_file:
pdf_file.write(img2pdf.convert(img_file))
# Copy what file stat is possible
copy_basic_file_stats(tiff_path, pdf_path)
return pdf_path
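A usage sketch of the extracted helper, assuming a hypothetical scan.tiff and a scratch directory the consumer does not watch (so the intermediate files cannot trigger a new consume):

import tempfile
from pathlib import Path

from documents.converters import convert_from_tiff_to_pdf

with tempfile.TemporaryDirectory() as scratch:
    pdf = convert_from_tiff_to_pdf(Path("scan.tiff"), Path(scratch))
    print(pdf)  # <scratch>/scan.pdf, carrying the TIFF's basic stats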

View File

@@ -0,0 +1,131 @@
import datetime as dt
import logging
import os
import shutil
from pathlib import Path
from django.conf import settings
from pikepdf import Pdf
from documents.consumer import ConsumerError
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument
logger = logging.getLogger("paperless.double_sided")
# Hardcoded for now, could be made a configurable setting if needed
TIMEOUT_MINUTES = 30
# Used by test cases
STAGING_FILE_NAME = "double-sided-staging.pdf"
def collate(input_doc: ConsumableDocument) -> str:
"""
Tries to collate pages from two single-sided scans of a double-sided
document.
When called with a file, it checks whether a staging file already
exists; if not, the current file becomes that staging file, holding the
odd numbered pages.
If a staging file exists and is not too old, the current file is taken
to be the second part (the even numbered pages) and the pages of both
are collated. The pages of the second file are inserted in reverse
order, since the ADF will have scanned them from bottom to top.
Returns a status message on success, or raises a ConsumerError in case
of failure.
"""
# Make sure scratch dir exists, Consumer might not have run yet
settings.SCRATCH_DIR.mkdir(exist_ok=True)
if input_doc.mime_type == "application/pdf":
pdf_file = input_doc.original_file
elif (
input_doc.mime_type == "image/tiff"
and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
):
pdf_file = convert_from_tiff_to_pdf(
input_doc.original_file,
settings.SCRATCH_DIR,
)
input_doc.original_file.unlink()
else:
raise ConsumerError("Unsupported file type for collation of double-sided scans")
staging = settings.SCRATCH_DIR / STAGING_FILE_NAME
valid_staging_exists = False
if staging.exists():
stats = os.stat(str(staging))
# if the file is older than the timeout, we don't consider
# it valid
if dt.datetime.now().timestamp() - stats.st_mtime > TIMEOUT_MINUTES * 60:
logger.warning("Outdated double sided staging file exists, deleting it")
os.unlink(str(staging))
else:
valid_staging_exists = True
if valid_staging_exists:
try:
# Collate pages from second PDF in reverse order
with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
pdf2.pages.reverse()
try:
for i, page in enumerate(pdf2.pages):
pdf1.pages.insert(2 * i + 1, page)
except IndexError:
raise ConsumerError(
"This second file (even numbered pages) contains more "
"pages than the first/odd numbered one. This means the "
"two uploaded files don't belong to the same double-"
"sided scan. Please retry, starting with the odd "
"numbered pages again.",
)
# Merged file has the same path, but without the
# double-sided subdir. Therefore, it is also in the
# consumption dir and will be picked up for processing
old_file = input_doc.original_file
new_file = Path(
*(
part
for part in old_file.with_name(
f"{old_file.stem}-collated.pdf",
).parts
if part != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
),
)
# If the user didn't create the subdirs yet, do it for them
new_file.parent.mkdir(parents=True, exist_ok=True)
pdf1.save(new_file)
logger.info("Collated documents into new file %s", new_file)
return (
"Success. Even numbered pages of double sided scan collated "
"with odd pages"
)
finally:
# Delete staging and recently uploaded file no matter what.
# If any error occurs, the user needs to be able to restart
# the process from scratch; after all, the staging file
# with the odd numbered pages might be the culprit
pdf_file.unlink()
staging.unlink()
else:
# In Python 3.9 move supports Path objects directly,
# but for now we have to be compatible with 3.8
shutil.move(str(pdf_file), str(staging))
# set access and modification times to now, so we can tell whether the
# staging file is outdated when another file gets uploaded
os.utime(str(staging), (dt.datetime.now().timestamp(),) * 2)
logger.info(
"Got scan with odd numbered pages of double-sided scan, moved it to %s",
staging,
)
return (
"Received odd numbered pages of double sided scan, waiting up to "
f"{TIMEOUT_MINUTES} minutes for even numbered pages"
)
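The collation itself is plain index arithmetic: once the second (bottom-to-top) scan is reversed, inserting its page i at position 2 * i + 1 alternates the two sequences. The same logic on plain lists, with hypothetical page labels:

odd_pages = ["p1", "p3", "p5"]   # first upload, as scanned
even_pages = ["p6", "p4", "p2"]  # second upload, ADF scans bottom to top

even_pages.reverse()  # p2, p4, p6
for i, page in enumerate(even_pages):
    odd_pages.insert(2 * i + 1, page)

assert odd_pages == ["p1", "p2", "p3", "p4", "p5", "p6"]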

View File

@@ -218,6 +218,7 @@ def generate_filename(
tag_list=tag_list,
owner_username=owner_username_str,
original_name=original_name,
doc_pk=f"{doc.pk:07}",
).strip()
if settings.FILENAME_FORMAT_REMOVE_NONE:
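The new doc_pk placeholder is the document's primary key zero-padded to seven digits via Python's format spec:

# The :07 spec left-pads with zeros to a minimum width of 7.
assert f"{1:07}" == "0000001"
assert f"{13579:07}" == "0013579"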

View File

@@ -11,13 +11,17 @@ from typing import Set
import tqdm
from django.conf import settings
from django.contrib.auth.models import Group
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core import serializers
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.db import transaction
from django.utils import timezone
from filelock import FileLock
from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission
from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_filename
@@ -33,6 +37,7 @@ from documents.models import UiSettings
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.db import GnuPG
from paperless_mail.models import MailAccount
@@ -261,6 +266,22 @@ class Command(BaseCommand):
serializers.serialize("json", UiSettings.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", ContentType.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", Permission.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", UserObjectPermission.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", GroupObjectPermission.objects.all()),
)
# 3. Export files from each document
for index, document_dict in tqdm.tqdm(
enumerate(document_manifest),
@@ -417,4 +438,4 @@ class Command(BaseCommand):
if perform_copy:
target.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(source, target)
copy_file_with_basic_stats(source, target)

View File

@@ -1,17 +1,20 @@
import json
import logging
import os
import shutil
from contextlib import contextmanager
from pathlib import Path
import tqdm
from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import FieldDoesNotExist
from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.core.serializers.base import DeserializationError
from django.db import IntegrityError
from django.db import transaction
from django.db.models.signals import m2m_changed
from django.db.models.signals import post_save
from filelock import FileLock
@@ -23,6 +26,7 @@ from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.signals.handlers import update_filename_and_move_files
from documents.utils import copy_file_with_basic_stats
from paperless import version
@@ -116,9 +120,13 @@ class Command(BaseCommand):
):
# Fill up the database with whatever is in the manifest
try:
for manifest_path in manifest_paths:
call_command("loaddata", manifest_path)
except (FieldDoesNotExist, DeserializationError) as e:
with transaction.atomic():
for manifest_path in manifest_paths:
# delete these since pk can change, re-created from import
ContentType.objects.all().delete()
Permission.objects.all().delete()
call_command("loaddata", manifest_path)
except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
self.stdout.write(self.style.ERROR("Database import failed"))
if (
self.version is not None
@@ -238,7 +246,7 @@ class Command(BaseCommand):
create_source_path_directory(document.source_path)
shutil.copy2(document_path, document.source_path)
copy_file_with_basic_stats(document_path, document.source_path)
if thumbnail_path:
if thumbnail_path.suffix in {".png", ".PNG"}:
@@ -253,13 +261,16 @@ class Command(BaseCommand):
output_file=str(document.thumbnail_path),
)
else:
shutil.copy2(thumbnail_path, document.thumbnail_path)
copy_file_with_basic_stats(
thumbnail_path,
document.thumbnail_path,
)
if archive_path:
create_source_path_directory(document.archive_path)
# TODO: this assumes that the export is valid and
# archive_filename is present on all documents with
# archived files
shutil.copy2(archive_path, document.archive_path)
copy_file_with_basic_stats(archive_path, document.archive_path)
document.save()

View File

@@ -1,7 +1,9 @@
import logging
import re
from documents.classifier import DocumentClassifier
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import StoragePath
@@ -11,7 +13,7 @@ from documents.permissions import get_objects_for_user_owner_aware
logger = logging.getLogger("paperless.matching")
def log_reason(matching_model, document, reason):
def log_reason(matching_model: MatchingModel, document: Document, reason: str):
class_name = type(matching_model).__name__
logger.debug(
f"{class_name} {matching_model.name} matched on document "
@@ -19,7 +21,7 @@ def log_reason(matching_model, document, reason):
)
def match_correspondents(document, classifier, user=None):
def match_correspondents(document: Document, classifier: DocumentClassifier, user=None):
pred_id = classifier.predict_correspondent(document.content) if classifier else None
if user is None and document.owner is not None:
@@ -35,11 +37,15 @@ def match_correspondents(document, classifier, user=None):
correspondents = Correspondent.objects.all()
return list(
filter(lambda o: matches(o, document) or o.pk == pred_id, correspondents),
filter(
lambda o: matches(o, document)
or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
correspondents,
),
)
def match_document_types(document, classifier, user=None):
def match_document_types(document: Document, classifier: DocumentClassifier, user=None):
pred_id = classifier.predict_document_type(document.content) if classifier else None
if user is None and document.owner is not None:
@@ -55,11 +61,15 @@ def match_document_types(document, classifier, user=None):
document_types = DocumentType.objects.all()
return list(
filter(lambda o: matches(o, document) or o.pk == pred_id, document_types),
filter(
lambda o: matches(o, document)
or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
document_types,
),
)
def match_tags(document, classifier, user=None):
def match_tags(document: Document, classifier: DocumentClassifier, user=None):
predicted_tag_ids = classifier.predict_tags(document.content) if classifier else []
if user is None and document.owner is not None:
@@ -71,11 +81,18 @@ def match_tags(document, classifier, user=None):
tags = Tag.objects.all()
return list(
filter(lambda o: matches(o, document) or o.pk in predicted_tag_ids, tags),
filter(
lambda o: matches(o, document)
or (
o.matching_algorithm == MatchingModel.MATCH_AUTO
and o.pk in predicted_tag_ids
),
tags,
),
)
def match_storage_paths(document, classifier, user=None):
def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None):
pred_id = classifier.predict_storage_path(document.content) if classifier else None
if user is None and document.owner is not None:
@@ -92,13 +109,14 @@ def match_storage_paths(document, classifier, user=None):
return list(
filter(
lambda o: matches(o, document) or o.pk == pred_id,
lambda o: matches(o, document)
or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
storage_paths,
),
)
def matches(matching_model, document):
def matches(matching_model: MatchingModel, document: Document):
search_kwargs = {}
document_content = document.content
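All four matchers now share the same guard: a classifier prediction only selects an object when that object is configured for automatic matching, so switching a correspondent, document type, tag, or storage path away from Auto stops a previously trained model from still assigning it. A self-contained sketch of the predicate (MATCH_AUTO below is a stand-in for MatchingModel.MATCH_AUTO):

from dataclasses import dataclass

MATCH_AUTO = 6  # stand-in value for MatchingModel.MATCH_AUTO

@dataclass
class Candidate:
    pk: int
    matching_algorithm: int

def selected(obj: Candidate, rule_hit: bool, pred_id: int) -> bool:
    # Rule hits always count; predictions count only for Auto objects.
    return rule_hit or (obj.pk == pred_id and obj.matching_algorithm == MATCH_AUTO)

assert not selected(Candidate(pk=3, matching_algorithm=1), False, pred_id=3)
assert selected(Candidate(pk=3, matching_algorithm=MATCH_AUTO), False, pred_id=3)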

View File

@@ -0,0 +1,162 @@
# Generated by Django 4.1.9 on 2023-06-29 19:29
import logging
import multiprocessing.pool
import shutil
import tempfile
import time
from pathlib import Path
import gnupg
from django.conf import settings
from django.db import migrations
from documents.parsers import run_convert
logger = logging.getLogger("paperless.migrations")
def _do_convert(work_package):
(
existing_encrypted_thumbnail,
converted_encrypted_thumbnail,
passphrase,
) = work_package
try:
gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)
logger.info(f"Decrypting thumbnail: {existing_encrypted_thumbnail}")
# Decrypt png
decrypted_thumbnail = existing_encrypted_thumbnail.with_suffix("").resolve()
with open(existing_encrypted_thumbnail, "rb") as existing_encrypted_file:
raw_thumb = gpg.decrypt_file(
existing_encrypted_file,
passphrase=passphrase,
always_trust=True,
).data
with open(decrypted_thumbnail, "wb") as decrypted_file:
decrypted_file.write(raw_thumb)
converted_decrypted_thumbnail = Path(
str(converted_encrypted_thumbnail).replace("webp.gpg", "webp"),
).resolve()
logger.info(f"Converting decrypted thumbnail: {decrypted_thumbnail}")
# Convert to webp
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=f"{decrypted_thumbnail}[0]",
output_file=str(converted_decrypted_thumbnail),
)
logger.info(
f"Encrypting converted thumbnail: {converted_decrypted_thumbnail}",
)
# Encrypt webp
with open(converted_decrypted_thumbnail, "rb") as converted_decrypted_file:
encrypted = gpg.encrypt_file(
fileobj_or_path=converted_decrypted_file,
recipients=None,
passphrase=passphrase,
symmetric=True,
always_trust=True,
).data
with open(converted_encrypted_thumbnail, "wb") as converted_encrypted_file:
converted_encrypted_file.write(encrypted)
# Copy newly created thumbnail to thumbnail directory
shutil.copy(converted_encrypted_thumbnail, existing_encrypted_thumbnail.parent)
# Remove the existing encrypted PNG version
existing_encrypted_thumbnail.unlink()
# Remove the decrypted PNG version
decrypted_thumbnail.unlink()
# Remove the decrypted WebP version
converted_decrypted_thumbnail.unlink()
logger.info(
"Conversion to WebP completed, "
f"replaced {existing_encrypted_thumbnail.name} with {converted_encrypted_thumbnail.name}",
)
except Exception as e:
logger.error(f"Error converting thumbnail (existing file unchanged): {e}")
def _convert_encrypted_thumbnails_to_webp(apps, schema_editor):
start = time.time()
with tempfile.TemporaryDirectory() as tempdir:
work_packages = []
if len(list(Path(settings.THUMBNAIL_DIR).glob("*.png.gpg"))) > 0:
passphrase = settings.PASSPHRASE
if not passphrase:
raise Exception(
"Passphrase not defined, encrypted thumbnails cannot be migrated"
"without this",
)
for file in Path(settings.THUMBNAIL_DIR).glob("*.png.gpg"):
existing_thumbnail = file.resolve()
# Change the existing filename suffix from png to webp
converted_thumbnail_name = Path(
str(existing_thumbnail).replace(".png.gpg", ".webp.gpg"),
).name
# Create the expected output filename in the tempdir
converted_thumbnail = (
Path(tempdir) / Path(converted_thumbnail_name)
).resolve()
# Package up the necessary info
work_packages.append(
(existing_thumbnail, converted_thumbnail, passphrase),
)
if len(work_packages):
logger.info(
"\n\n"
" This is a one-time only migration to convert thumbnails for all of your\n"
" *encrypted* documents into WebP format. If you have a lot of encrypted documents, \n"
" this may take a while, so a coffee break may be in order."
"\n",
)
with multiprocessing.pool.Pool(
processes=min(multiprocessing.cpu_count(), 4),
maxtasksperchild=4,
) as pool:
pool.map(_do_convert, work_packages)
end = time.time()
duration = end - start
logger.info(f"Conversion completed in {duration:.3f}s")
class Migration(migrations.Migration):
dependencies = [
("documents", "1036_alter_savedviewfilterrule_rule_type"),
]
operations = [
migrations.RunPython(
code=_convert_encrypted_thumbnails_to_webp,
reverse_code=migrations.RunPython.noop,
),
]

View File

@@ -18,6 +18,7 @@ from django.utils import timezone
from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration
from documents.utils import copy_file_with_basic_stats
# This regular expression will try to find dates in the document at
# hand and will match the following formats:
@@ -31,16 +32,18 @@ from documents.signals import document_consumer_declaration
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits. MONTH is 3 letters
# - XXPP MONTH ZZZZ with XX being 1 or 2 and PP being 2 letters and ZZZZ being 4 digits
# TODO: isn't there a date parsing library for this?
DATE_REGEX = re.compile(
r"(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|" # noqa: E501
r"(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|" # noqa: E501
r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|" # noqa: E501
r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[a-zA-Z]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|" # noqa: E501
r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|"
r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))|"
r"(\b|(?!=([_-])))(\b[0-9]{1,2}[ \.\/-][A-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))", # noqa: E501
r"(\b|(?!=([_-])))([0-9]{1,2}[^ ]{2}[\. ]+[^ ]{3,9}[ \.\/-][0-9]{4})(\b|(?=([_-])))|" # noqa: E501
r"(\b|(?!=([_-])))(\b[0-9]{1,2}[ \.\/-][a-zA-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))", # noqa: E501
)
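The two added alternatives teach the regex ordinal day suffixes ("21st", "25TH") and lowercase month abbreviations ("21-mar-2022"), which the previous [A-Z]{3} branch could not match. A reduced, self-contained sketch of just the ordinal branch, reusing the diff's [^ ]{2} suffix idea without the surrounding boundary groups:

import re

ordinal_date = re.compile(r"\b[0-9]{1,2}[^ ]{2}[\. ]+[^ ]{3,9}[ \.\/-][0-9]{4}\b")

assert ordinal_date.search("Currency 21st MAR 2022 Credit Card")
assert ordinal_date.search("Currency 25TH MAR 2022 Credit Card")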
@@ -206,7 +209,7 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
# so we need to copy it before it gets moved.
# https://github.com/paperless-ngx/paperless-ngx/issues/3631
default_thumbnail_path = os.path.join(temp_dir, "document.png")
shutil.copy2(get_default_thumbnail(), default_thumbnail_path)
copy_file_with_basic_stats(get_default_thumbnail(), default_thumbnail_path)
return default_thumbnail_path

View File

@@ -1,6 +1,7 @@
import logging
import os
import shutil
from typing import Optional
from celery import states
from celery.signals import before_task_publish
@@ -21,6 +22,7 @@ from django.utils import timezone
from filelock import FileLock
from documents import matching
from documents.classifier import DocumentClassifier
from documents.file_handling import create_source_path_directory
from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_unique_filename
@@ -33,7 +35,7 @@ from documents.permissions import get_objects_for_user_owner_aware
logger = logging.getLogger("paperless.handlers")
def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
def add_inbox_tags(sender, document: Document, logging_group=None, **kwargs):
if document.owner is not None:
tags = get_objects_for_user_owner_aware(
document.owner,
@@ -48,9 +50,9 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
def set_correspondent(
sender,
document=None,
document: Document,
logging_group=None,
classifier=None,
classifier: Optional[DocumentClassifier] = None,
replace=False,
use_first=True,
suggest=False,
@@ -111,9 +113,9 @@ def set_correspondent(
def set_document_type(
sender,
document=None,
document: Document,
logging_group=None,
classifier=None,
classifier: Optional[DocumentClassifier] = None,
replace=False,
use_first=True,
suggest=False,
@@ -175,9 +177,9 @@ def set_document_type(
def set_tags(
sender,
document=None,
document: Document,
logging_group=None,
classifier=None,
classifier: Optional[DocumentClassifier] = None,
replace=False,
suggest=False,
base_url=None,
@@ -239,9 +241,9 @@ def set_tags(
def set_storage_path(
sender,
document=None,
document: Document,
logging_group=None,
classifier=None,
classifier: Optional[DocumentClassifier] = None,
replace=False,
use_first=True,
suggest=False,
@@ -491,7 +493,7 @@ def update_filename_and_move_files(sender, instance: Document, **kwargs):
)
def set_log_entry(sender, document=None, logging_group=None, **kwargs):
def set_log_entry(sender, document: Document, logging_group=None, **kwargs):
ct = ContentType.objects.get(model="document")
user = User.objects.get(username="consumer")

View File

@@ -25,6 +25,7 @@ from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.double_sided import collate
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
@@ -64,6 +65,12 @@ def train_classifier():
and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
and not StoragePath.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
):
logger.info("No automatic matching items, not training")
# Special case, items were once auto and trained, so remove the model
# and prevent its use again
if settings.MODEL_FILE.exists():
logger.info(f"Removing {settings.MODEL_FILE} so it won't be used")
settings.MODEL_FILE.unlink()
return
classifier = load_classifier()
@@ -89,10 +96,40 @@ def consume_file(
input_doc: ConsumableDocument,
overrides: Optional[DocumentMetadataOverrides] = None,
):
def send_progress(status="SUCCESS", message="finished"):
payload = {
"filename": overrides.filename or input_doc.original_file.name,
"task_id": None,
"current_progress": 100,
"max_progress": 100,
"status": status,
"message": message,
}
try:
async_to_sync(get_channel_layer().group_send)(
"status_updates",
{"type": "status_update", "data": payload},
)
except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {e!s}")
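send_progress wraps channels' asynchronous group_send so this synchronous task can push a final status update, deduplicating the payload code that used to be inlined further below. The underlying pattern, as a sketch assuming a configured channel layer:

from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer

def notify(payload: dict) -> None:
    # Drive the async group_send to completion from synchronous code.
    layer = get_channel_layer()
    try:
        async_to_sync(layer.group_send)(
            "status_updates",
            {"type": "status_update", "data": payload},
        )
    except ConnectionError:
        pass  # e.g. broker down; losing a progress update is non-fatal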
# Default no overrides
if overrides is None:
overrides = DocumentMetadataOverrides()
# Handle collation of double-sided documents scanned in two parts
if settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED and (
settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
in input_doc.original_file.parts
):
try:
msg = collate(input_doc)
send_progress(message=msg)
return msg
except ConsumerError as e:
send_progress(status="FAILURE", message=e.args[0])
raise e
# read all barcodes in the current document
if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader:
@@ -102,32 +139,18 @@ def consume_file(
):
# notify the sender, otherwise the progress bar
# in the UI stays stuck
payload = {
"filename": overrides.filename or input_doc.original_file.name,
"task_id": None,
"current_progress": 100,
"max_progress": 100,
"status": "SUCCESS",
"message": "finished",
}
try:
async_to_sync(get_channel_layer().group_send)(
"status_updates",
{"type": "status_update", "data": payload},
)
except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {e!s}")
send_progress()
# consuming stops here, since the original document with
# the barcodes has been split and will be consumed separately
input_doc.original_file.unlink()
return "File successfully split"
# try reading the ASN from barcode
if settings.CONSUMER_ENABLE_ASN_BARCODE:
if settings.CONSUMER_ENABLE_ASN_BARCODE and reader.asn is not None:
# Note this will take precedence over an API provided ASN
# But it's from a physical barcode, so that's good
overrides.asn = reader.asn
if overrides.asn:
logger.info(f"Found ASN in barcode: {overrides.asn}")
logger.info(f"Found ASN in barcode: {overrides.asn}")
# continue with consumption if no barcode was found
document = Consumer().try_consume_file(

Binary file not shown.

Binary file not shown.

View File

@@ -2369,6 +2369,62 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
self.assertEqual(resp_data["note"], "this is a posted note")
def test_notes_permissions_aware(self):
"""
GIVEN:
- Existing document owned by user2 but with granted view perms for user1
WHEN:
- API request is made by user1 to add or delete a note
THEN:
- Notes are neither created nor deleted
"""
user1 = User.objects.create_user(username="test1")
user1.user_permissions.add(*Permission.objects.all())
user1.save()
user2 = User.objects.create_user(username="test2")
user2.save()
doc = Document.objects.create(
title="test",
mime_type="application/pdf",
content="this is a document which will have notes added",
)
doc.owner = user2
doc.save()
self.client.force_authenticate(user1)
resp = self.client.get(
f"/api/documents/{doc.pk}/notes/",
format="json",
)
self.assertEqual(resp.content, b"Insufficient permissions to view")
self.assertEqual(resp.status_code, status.HTTP_403_FORBIDDEN)
assign_perm("view_document", user1, doc)
resp = self.client.post(
f"/api/documents/{doc.pk}/notes/",
data={"note": "this is a posted note"},
)
self.assertEqual(resp.content, b"Insufficient permissions to create")
self.assertEqual(resp.status_code, status.HTTP_403_FORBIDDEN)
note = Note.objects.create(
note="This is a note.",
document=doc,
user=user2,
)
response = self.client.delete(
f"/api/documents/{doc.pk}/notes/?id={note.pk}",
format="json",
)
self.assertEqual(response.content, b"Insufficient permissions to delete")
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
def test_delete_note(self):
"""
GIVEN:

View File

@@ -21,6 +21,7 @@ from django.utils import timezone
from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.consumer import ConsumerFilePhase
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
@@ -228,8 +229,8 @@ def fake_magic_from_file(file, mime=False):
class TestConsumer(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def _assert_first_last_send_progress(
self,
first_status="STARTING",
last_status="SUCCESS",
first_status=ConsumerFilePhase.STARTED,
last_status=ConsumerFilePhase.SUCCESS,
first_progress=0,
first_progress_max=100,
last_progress=100,
@@ -561,10 +562,16 @@ class TestConsumer(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@mock.patch("documents.consumer.load_classifier")
def testClassifyDocument(self, m):
correspondent = Correspondent.objects.create(name="test")
dtype = DocumentType.objects.create(name="test")
t1 = Tag.objects.create(name="t1")
t2 = Tag.objects.create(name="t2")
correspondent = Correspondent.objects.create(
name="test",
matching_algorithm=Correspondent.MATCH_AUTO,
)
dtype = DocumentType.objects.create(
name="test",
matching_algorithm=DocumentType.MATCH_AUTO,
)
t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO)
t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO)
m.return_value = MagicMock()
m.return_value.predict_correspondent.return_value = correspondent.pk

View File

@@ -152,6 +152,55 @@ class TestDate(TestCase):
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
self.assertIsNone(parse_date("", text), None)
def test_date_format_19(self):
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_20(self):
text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_21(self):
text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_22(self):
text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 23, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_23(self):
text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_24(self):
text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_date_format_25(self):
text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
self.assertEqual(
parse_date("", text),
datetime.datetime(2022, 3, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_crazy_date_past(self, *args):
self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))

View File

@@ -0,0 +1,253 @@
import datetime as dt
import os
import shutil
from pathlib import Path
from typing import Union
from unittest import mock
from django.test import TestCase
from django.test import override_settings
from pdfminer.high_level import extract_text
from pikepdf import Pdf
from documents import tasks
from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentSource
from documents.double_sided import STAGING_FILE_NAME
from documents.double_sided import TIMEOUT_MINUTES
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin
@override_settings(
CONSUMER_RECURSIVE=True,
CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=True,
)
class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_DIR = Path(__file__).parent / "samples"
def setUp(self):
super().setUp()
self.dirs.double_sided_dir = self.dirs.consumption_dir / "double-sided"
self.dirs.double_sided_dir.mkdir()
self.staging_file = self.dirs.scratch_dir / STAGING_FILE_NAME
def consume_file(self, srcname, dstname: Union[str, Path] = "foo.pdf"):
"""
Starts the consume process and also ensures the
destination file does not exist afterwards
"""
src = self.SAMPLE_DIR / srcname
dst = self.dirs.double_sided_dir / dstname
dst.parent.mkdir(parents=True, exist_ok=True)
shutil.copy(src, dst)
with mock.patch("documents.tasks.async_to_sync"), mock.patch(
"documents.consumer.async_to_sync",
):
msg = tasks.consume_file(
ConsumableDocument(
source=DocumentSource.ConsumeFolder,
original_file=dst,
),
None,
)
self.assertIsNotFile(dst)
return msg
def create_staging_file(self, src="double-sided-odd.pdf", datetime=None):
shutil.copy(self.SAMPLE_DIR / src, self.staging_file)
if datetime is None:
datetime = dt.datetime.now()
os.utime(str(self.staging_file), (datetime.timestamp(),) * 2)
def test_odd_numbered_moved_to_staging(self):
"""
GIVEN:
- No staging file exists
WHEN:
- A file is copied into the double-sided consume directory
THEN:
- The file becomes the new staging file
- The file in the consume directory gets removed
- The staging file has the st_mtime set to now
- The user gets informed
"""
msg = self.consume_file("double-sided-odd.pdf")
self.assertIsFile(self.staging_file)
self.assertAlmostEqual(
dt.datetime.fromtimestamp(self.staging_file.stat().st_mtime),
dt.datetime.now(),
delta=dt.timedelta(seconds=5),
)
self.assertIn("Received odd numbered pages", msg)
def test_collation(self):
"""
GIVEN:
- A staging file not older than TIMEOUT_MINUTES with odd pages exists
WHEN:
- A file is copied into the double-sided consume directory
THEN:
- A new file containing the collated staging and uploaded file is
created and put into the consume directory
- The new file is named "foo-collated.pdf", where foo is the name of
the second file
- Both staging and uploaded file get deleted
- The new file contains the pages in the correct order
"""
self.create_staging_file()
self.consume_file("double-sided-even.pdf", "some-random-name.pdf")
target = self.dirs.consumption_dir / "some-random-name-collated.pdf"
self.assertIsFile(target)
self.assertIsNotFile(self.staging_file)
self.assertRegex(
extract_text(str(target)),
r"(?s)"
r"This is page 1.*This is page 2.*This is page 3.*"
r"This is page 4.*This is page 5",
)
def test_staging_file_expiration(self):
"""
GIVEN:
- A staging file older than TIMEOUT_MINUTES exists
WHEN:
- A file is copied into the double-sided consume directory
THEN:
- It becomes the new staging file
"""
self.create_staging_file(
datetime=dt.datetime.now()
- dt.timedelta(minutes=TIMEOUT_MINUTES, seconds=1),
)
msg = self.consume_file("double-sided-odd.pdf")
self.assertIsFile(self.staging_file)
self.assertIn("Received odd numbered pages", msg)
def test_less_odd_pages_then_even_fails(self):
"""
GIVEN:
- A valid staging file
WHEN:
- A file is copied into the double-sided consume directory
that has more pages than the staging file
THEN:
- Both files get removed
- A ConsumerError exception is thrown
"""
self.create_staging_file("simple.pdf")
self.assertRaises(
ConsumerError,
self.consume_file,
"double-sided-even.pdf",
)
self.assertIsNotFile(self.staging_file)
@override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=True)
def test_tiff_upload_enabled(self):
"""
GIVEN:
- CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is true
- No staging file exists
WHEN:
- A TIFF file gets uploaded into the double-sided
consume dir
THEN:
- The file is converted into a PDF and moved to
the staging file
"""
self.consume_file("simple.tiff", "simple.tiff")
self.assertIsFile(self.staging_file)
# Ensure the file is a valid PDF by trying to read it
Pdf.open(self.staging_file)
@override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=False)
def test_tiff_upload_disabled(self):
"""
GIVEN:
- CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is false
- No staging file exists
WHEN:
- A TIFF file gets uploaded into the double-sided
consume dir
THEN:
- A ConsumerError is raised
"""
self.assertRaises(
ConsumerError,
self.consume_file,
"simple.tiff",
"simple.tiff",
)
@override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME="quux")
def test_different_upload_dir_name(self):
"""
GIVEN:
- No staging file exists
- CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME is set to quux
WHEN:
- A file is uploaded into the quux dir
THEN:
- A staging file is created
"""
self.consume_file("double-sided-odd.pdf", Path("..") / "quux" / "foo.pdf")
self.assertIsFile(self.staging_file)
def test_only_double_sided_dir_is_handled(self):
"""
GIVEN:
- No staging file exists
WHEN:
- A file is uploaded into the normal consumption dir
THEN:
- The file is processed as normal
"""
msg = self.consume_file("simple.pdf", Path("..") / "simple.pdf")
self.assertIsNotFile(self.staging_file)
self.assertRegex(msg, "Success. New document .* created")
def test_subdirectory_upload(self):
"""
GIVEN:
- A staging file exists
WHEN:
- A file gets uploaded into foo/bar/double-sided
or double-sided/foo/bar
THEN:
- The collated file gets put into foo/bar
"""
for path in [
Path("foo") / "bar" / "double-sided",
Path("double-sided") / "foo" / "bar",
]:
with self.subTest(path=path):
# Ensure we get fresh directories for each run
self.tearDown()
self.setUp()
self.create_staging_file()
self.consume_file("double-sided-odd.pdf", path / "foo.pdf")
self.assertIsFile(
self.dirs.consumption_dir / "foo" / "bar" / "foo-collated.pdf",
)
@override_settings(CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=False)
def test_disabled_double_sided_dir_upload(self):
"""
GIVEN:
- CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED is false
WHEN:
- A file is uploaded into the double-sided directory
THEN:
- The file is processed like a normal upload
"""
msg = self.consume_file("simple.pdf")
self.assertIsNotFile(self.staging_file)
self.assertRegex(msg, "Success. New document .* created")

View File

@@ -446,6 +446,19 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsNotDir(os.path.join(settings.ORIGINALS_DIR, "none"))
self.assertIsDir(settings.ORIGINALS_DIR)
@override_settings(FILENAME_FORMAT="{doc_pk}")
def test_format_doc_pk(self):
document = Document()
document.pk = 1
document.mime_type = "application/pdf"
document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED
self.assertEqual(generate_filename(document), "0000001.pdf")
document.pk = 13579
self.assertEqual(generate_filename(document), "0013579.pdf")
@override_settings(FILENAME_FORMAT=None)
def test_format_none(self):
document = Document()

View File

@@ -7,11 +7,18 @@ from pathlib import Path
from unittest import mock
from zipfile import ZipFile
from django.contrib.auth.models import Group
from django.contrib.auth.models import Permission
from django.contrib.contenttypes.models import ContentType
from django.core.management import call_command
from django.core.management.base import CommandError
from django.db import IntegrityError
from django.test import TestCase
from django.test import override_settings
from django.utils import timezone
from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission
from guardian.shortcuts import assign_perm
from documents.management.commands import document_exporter
from documents.models import Correspondent
@@ -34,6 +41,8 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.addCleanup(shutil.rmtree, self.target)
self.user = User.objects.create(username="temp_admin")
self.user2 = User.objects.create(username="user2")
self.group1 = Group.objects.create(name="group1")
self.d1 = Document.objects.create(
content="Content",
@@ -73,6 +82,9 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
user=self.user,
)
assign_perm("view_document", self.user2, self.d2)
assign_perm("view_document", self.group1, self.d3)
self.t1 = Tag.objects.create(name="t")
self.dt1 = DocumentType.objects.create(name="dt")
self.c1 = Correspondent.objects.create(name="c")
@@ -141,12 +153,12 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
manifest = self._do_export(use_filename_format=use_filename_format)
self.assertEqual(len(manifest), 10)
self.assertEqual(len(manifest), 149)
# don't include consumer or AnonymousUser users
self.assertEqual(
len(list(filter(lambda e: e["model"] == "auth.user", manifest))),
1,
2,
)
self.assertEqual(
@@ -218,6 +230,9 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
Correspondent.objects.all().delete()
DocumentType.objects.all().delete()
Tag.objects.all().delete()
Permission.objects.all().delete()
UserObjectPermission.objects.all().delete()
GroupObjectPermission.objects.all().delete()
self.assertEqual(Document.objects.count(), 0)
call_command("document_importer", "--no-progress-bar", self.target)
@@ -230,6 +245,9 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(Document.objects.get(id=self.d2.id).title, "wow2")
self.assertEqual(Document.objects.get(id=self.d3.id).title, "wow2")
self.assertEqual(Document.objects.get(id=self.d4.id).title, "wow_dec")
self.assertEqual(GroupObjectPermission.objects.count(), 1)
self.assertEqual(UserObjectPermission.objects.count(), 1)
self.assertEqual(Permission.objects.count(), 108)
messages = check_sanity()
# everything is alright after the test
self.assertEqual(len(messages), 0)
@@ -259,7 +277,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
st_mtime_1 = os.stat(os.path.join(self.target, "manifest.json")).st_mtime
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2",
"documents.management.commands.document_exporter.copy_file_with_basic_stats",
) as m:
self._do_export()
m.assert_not_called()
@@ -270,7 +288,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
Path(self.d1.source_path).touch()
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2",
"documents.management.commands.document_exporter.copy_file_with_basic_stats",
) as m:
self._do_export()
self.assertEqual(m.call_count, 1)
@@ -293,7 +311,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertIsFile(os.path.join(self.target, "manifest.json"))
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2",
"documents.management.commands.document_exporter.copy_file_with_basic_stats",
) as m:
self._do_export()
m.assert_not_called()
@@ -304,7 +322,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.d2.save()
with mock.patch(
"documents.management.commands.document_exporter.shutil.copy2",
"documents.management.commands.document_exporter.copy_file_with_basic_stats",
) as m:
self._do_export(compare_checksums=True)
self.assertEqual(m.call_count, 1)
@@ -641,3 +659,47 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertEqual(Document.objects.count(), 0)
call_command("document_importer", "--no-progress-bar", self.target)
self.assertEqual(Document.objects.count(), 4)
def test_import_db_transaction_failed(self):
"""
GIVEN:
- Import from manifest started
WHEN:
- Import of database fails
THEN:
- ContentType & Permission objects are not deleted; the database transaction is rolled back
"""
shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
shutil.copytree(
os.path.join(os.path.dirname(__file__), "samples", "documents"),
os.path.join(self.dirs.media_dir, "documents"),
)
self.assertEqual(ContentType.objects.count(), 27)
self.assertEqual(Permission.objects.count(), 108)
manifest = self._do_export()
with paperless_environment():
self.assertEqual(
len(list(filter(lambda e: e["model"] == "auth.permission", manifest))),
108,
)
# add 1 more to db to show objects are not re-created by import
Permission.objects.create(
name="test",
codename="test_perm",
content_type_id=1,
)
self.assertEqual(Permission.objects.count(), 109)
# will cause an import error
self.user.delete()
self.user = User.objects.create(username="temp_admin")
with self.assertRaises(IntegrityError):
call_command("document_importer", "--no-progress-bar", self.target)
self.assertEqual(ContentType.objects.count(), 27)
self.assertEqual(Permission.objects.count(), 109)
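The rollback asserted here is standard Django behaviour when the manifest load runs inside a transaction; a minimal sketch of the pattern (function and variable names are illustrative, not the importer's actual code):

from django.core import serializers
from django.db import transaction

def import_manifest(manifest_json: str) -> None:
    # If any record fails to save (e.g. with an IntegrityError), the whole
    # block rolls back, leaving pre-existing rows such as the extra
    # Permission above untouched.
    with transaction.atomic():
        for deserialized in serializers.deserialize("json", manifest_json):
            deserialized.save()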

View File

@@ -2,6 +2,7 @@ import hashlib
import os
import shutil
from pathlib import Path
from typing import Optional
from unittest import mock
from django.conf import settings
@@ -60,8 +61,8 @@ def make_test_document(
mime_type: str,
original: str,
original_filename: str,
archive: str = None,
archive_filename: str = None,
archive: Optional[str] = None,
archive_filename: Optional[str] = None,
):
doc = document_class()
doc.filename = original_filename

View File

@@ -0,0 +1,276 @@
import shutil
import tempfile
from pathlib import Path
from typing import Callable
from typing import Iterable
from typing import Union
from unittest import mock
from django.test import override_settings
from documents.tests.utils import TestMigrations
@override_settings(PASSPHRASE="test")
@mock.patch(
"documents.migrations.1037_webp_encrypted_thumbnail_conversion.multiprocessing.pool.Pool.map",
)
@mock.patch("documents.migrations.1037_webp_encrypted_thumbnail_conversion.run_convert")
class TestMigrateToEncrytpedWebPThumbnails(TestMigrations):
migrate_from = "1036_alter_savedviewfilterrule_rule_type"
migrate_to = "1037_webp_encrypted_thumbnail_conversion"
auto_migrate = False
def pretend_convert_output(self, *args, **kwargs):
"""
Pretends to do the conversion by copying the input file
to the output file
"""
shutil.copy2(
Path(kwargs["input_file"].rstrip("[0]")),
Path(kwargs["output_file"]),
)
def pretend_map(self, func: Callable, iterable: Iterable):
"""
Pretends to be the map of a multiprocessing.Pool, but secretly runs
everything serially
"""
for item in iterable:
func(item)
def create_dummy_thumbnails(
self,
thumb_dir: Path,
ext: str,
count: int,
start_count: int = 0,
):
"""
Helper to create a given count of files with the given extension in a given directory
"""
for idx in range(count):
(Path(thumb_dir) / Path(f"{start_count + idx:07}.{ext}")).touch()
# Triple check expected files exist
self.assert_file_count_by_extension(ext, thumb_dir, count)
def create_webp_thumbnail_files(
self,
thumb_dir: Path,
count: int,
start_count: int = 0,
):
"""
Creates dummy WebP thumbnail files in the given directory
"""
self.create_dummy_thumbnails(thumb_dir, "webp", count, start_count)
def create_encrypted_webp_thumbnail_files(
self,
thumb_dir: Path,
count: int,
start_count: int = 0,
):
"""
Creates dummy encrypted WebP thumbnail files in the given directory
"""
self.create_dummy_thumbnails(thumb_dir, "webp.gpg", count, start_count)
def create_png_thumbnail_files(
self,
thumb_dir: Path,
count: int,
start_count: int = 0,
):
"""
Creates dummy PNG thumbnail files in the given directory
"""
self.create_dummy_thumbnails(thumb_dir, "png", count, start_count)
def create_encrypted_png_thumbnail_files(
self,
thumb_dir: Path,
count: int,
start_count: int = 0,
):
"""
Creates dummy encrypted PNG thumbnail files in the given directory
"""
self.create_dummy_thumbnails(thumb_dir, "png.gpg", count, start_count)
def assert_file_count_by_extension(
self,
ext: str,
dir: Union[str, Path],
expected_count: int,
):
"""
Helper to assert a given count of files with the given extension in the given directory
"""
if not isinstance(dir, Path):
dir = Path(dir)
matching_files = list(dir.glob(f"*.{ext}"))
self.assertEqual(len(matching_files), expected_count)
def assert_encrypted_png_file_count(self, dir: Path, expected_count: int):
"""
Helper to assert a given count of encrypted PNG files in the given directory
"""
self.assert_file_count_by_extension("png.gpg", dir, expected_count)
def assert_encrypted_webp_file_count(self, dir: Path, expected_count: int):
"""
Helper to assert a given count of encrypted WebP files in the given directory
"""
self.assert_file_count_by_extension("webp.gpg", dir, expected_count)
def assert_webp_file_count(self, dir: Path, expected_count: int):
"""
Helper to assert a given count of WebP files in the given directory
"""
self.assert_file_count_by_extension("webp", dir, expected_count)
def assert_png_file_count(self, dir: Path, expected_count: int):
"""
Helper to assert a given count of PNG files in the given directory
"""
self.assert_file_count_by_extension("png", dir, expected_count)
def setUp(self):
self.thumbnail_dir = Path(tempfile.mkdtemp()).resolve()
return super().setUp()
def tearDown(self) -> None:
shutil.rmtree(self.thumbnail_dir)
return super().tearDown()
def test_do_nothing_if_converted(
self,
run_convert_mock: mock.MagicMock,
map_mock: mock.MagicMock,
):
"""
GIVEN:
- Encrypted document exists with existing encrypted WebP thumbnail path
WHEN:
- Migration is attempted
THEN:
- Nothing is converted
"""
map_mock.side_effect = self.pretend_map
with override_settings(
THUMBNAIL_DIR=self.thumbnail_dir,
):
self.create_encrypted_webp_thumbnail_files(self.thumbnail_dir, 3)
self.performMigration()
run_convert_mock.assert_not_called()
self.assert_encrypted_webp_file_count(self.thumbnail_dir, 3)
def test_convert_thumbnails(
self,
run_convert_mock: mock.MagicMock,
map_mock: mock.MagicMock,
):
"""
GIVEN:
- Encrypted documents exist with PNG thumbnail
WHEN:
- Migration is attempted
THEN:
- Thumbnails are converted to webp & re-encrypted
"""
map_mock.side_effect = self.pretend_map
run_convert_mock.side_effect = self.pretend_convert_output
with override_settings(
THUMBNAIL_DIR=self.thumbnail_dir,
):
self.create_encrypted_png_thumbnail_files(self.thumbnail_dir, 3)
self.performMigration()
run_convert_mock.assert_called()
self.assertEqual(run_convert_mock.call_count, 3)
self.assert_encrypted_webp_file_count(self.thumbnail_dir, 3)
def test_convert_errors_out(
self,
run_convert_mock: mock.MagicMock,
map_mock: mock.MagicMock,
):
"""
GIVEN:
- Encrypted document exists with PNG thumbnail
WHEN:
- Migration is attempted, but every conversion raises an exception
THEN:
- No thumbnails are converted; the encrypted PNG files remain
"""
map_mock.side_effect = self.pretend_map
run_convert_mock.side_effect = OSError
with override_settings(
THUMBNAIL_DIR=self.thumbnail_dir,
):
self.create_encrypted_png_thumbnail_files(self.thumbnail_dir, 3)
self.performMigration()
run_convert_mock.assert_called()
self.assertEqual(run_convert_mock.call_count, 3)
self.assert_encrypted_png_file_count(self.thumbnail_dir, 3)
def test_convert_mixed(
self,
run_convert_mock: mock.MagicMock,
map_mock: mock.MagicMock,
):
"""
GIVEN:
- Documents exist with PNG, encrypted PNG and WebP thumbnails
WHEN:
- Migration is attempted
THEN:
- Only encrypted PNG thumbnails are converted
"""
map_mock.side_effect = self.pretend_map
run_convert_mock.side_effect = self.pretend_convert_output
with override_settings(
THUMBNAIL_DIR=self.thumbnail_dir,
):
self.create_png_thumbnail_files(self.thumbnail_dir, 3)
self.create_encrypted_png_thumbnail_files(
self.thumbnail_dir,
3,
start_count=3,
)
self.create_webp_thumbnail_files(self.thumbnail_dir, 2, start_count=6)
self.create_encrypted_webp_thumbnail_files(
self.thumbnail_dir,
3,
start_count=8,
)
self.performMigration()
run_convert_mock.assert_called()
self.assertEqual(run_convert_mock.call_count, 3)
self.assert_png_file_count(self.thumbnail_dir, 3)
self.assert_encrypted_webp_file_count(self.thumbnail_dir, 6)
self.assert_webp_file_count(self.thumbnail_dir, 2)
self.assert_encrypted_png_file_count(self.thumbnail_dir, 0)
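Since the mocks target run_convert and multiprocessing.pool.Pool.map, the migration presumably fans the conversions out over a process pool. A self-contained sketch of that pattern (the helper bodies and item shape are assumptions, not the migration's actual code):

import shutil
from multiprocessing import Pool
from pathlib import Path

def run_convert(*, input_file: str, output_file: str) -> None:
    # Stand-in for the migration's ImageMagick wrapper; here it just copies,
    # exactly like pretend_convert_output above.
    shutil.copy(input_file.rstrip("[0]"), output_file)

def convert_one(paths) -> None:
    source, dest = paths
    # "[0]" selects the first page/frame, matching the rstrip("[0]") above.
    run_convert(input_file=f"{source}[0]", output_file=str(dest))

if __name__ == "__main__":
    work_items = [(Path("0000001.png"), Path("0000001.webp"))]  # example pair
    with Pool(processes=4) as pool:
        pool.map(convert_one, work_items)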

src/documents/utils.py (new file)
View File

@@ -0,0 +1,43 @@
import shutil
from os import utime
from pathlib import Path
from typing import Tuple
from typing import Union
def _coerce_to_path(
source: Union[Path, str],
dest: Union[Path, str],
) -> Tuple[Path, Path]:
return Path(source).resolve(), Path(dest).resolve()
def copy_basic_file_stats(source: Union[Path, str], dest: Union[Path, str]) -> None:
"""
Copies only the mtime and atime attributes from source to destination.
Both are expected to exist.
The extended attribute copy done by copystat does weird things with SELinux
and files copied from temporary directories, and copystat doesn't allow
disabling those copies.
"""
source, dest = _coerce_to_path(source, dest)
src_stat = source.stat()
utime(dest, ns=(src_stat.st_atime_ns, src_stat.st_mtime_ns))
def copy_file_with_basic_stats(
source: Union[Path, str],
dest: Union[Path, str],
) -> None:
"""
A sort of simpler copy2 that doesn't copy extended file attributes,
only the access time and modified times from source to dest.
The extended attribute copy does weird things with SELinux and files
copied from temporary directories.
"""
source, dest = _coerce_to_path(source, dest)
shutil.copy(source, dest)
copy_basic_file_stats(source, dest)
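Usage mirrors shutil.copy2 minus the extended-attribute handling; a quick example with the helpers above (the paths are illustrative):

from pathlib import Path
from documents.utils import copy_basic_file_stats, copy_file_with_basic_stats

src = Path("/tmp/source.pdf")  # example paths
dst = Path("/tmp/dest.pdf")

copy_file_with_basic_stats(src, dst)  # copies content plus atime/mtime only
assert src.stat().st_mtime_ns == dst.stat().st_mtime_ns

# When the destination already exists and only timestamps should follow:
copy_basic_file_stats(src, dst)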

View File

@@ -502,19 +502,18 @@ class DocumentViewSet(
@action(methods=["get", "post", "delete"], detail=True)
def notes(self, request, pk=None):
currentUser = request.user
try:
doc = Document.objects.get(pk=pk)
if request.user is not None and not has_perms_owner_aware(
request.user,
if currentUser is not None and not has_perms_owner_aware(
currentUser,
"view_document",
doc,
):
return HttpResponseForbidden("Insufficient permissions")
return HttpResponseForbidden("Insufficient permissions to view")
except Document.DoesNotExist:
raise Http404
currentUser = request.user
if request.method == "GET":
try:
return Response(self.getNotes(doc))
@@ -525,6 +524,13 @@ class DocumentViewSet(
)
elif request.method == "POST":
try:
if currentUser is not None and not has_perms_owner_aware(
currentUser,
"change_document",
doc,
):
return HttpResponseForbidden("Insufficient permissions to create")
c = Note.objects.create(
document=doc,
note=request.data["note"],
@@ -545,6 +551,13 @@ class DocumentViewSet(
},
)
elif request.method == "DELETE":
if currentUser is not None and not has_perms_owner_aware(
currentUser,
"change_document",
doc,
):
return HttpResponseForbidden("Insufficient permissions to delete")
note = Note.objects.get(id=int(request.GET.get("id")))
note.delete()
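The net effect is that reading notes requires view_document, while creating or deleting them now requires change_document. A hedged sketch of how that splits in practice (the users, document, and endpoint path are assumptions based on the view above):

from rest_framework.test import APIClient

client = APIClient()
client.force_authenticate(user=viewer)  # `viewer` holds view_document only

assert client.get(f"/api/documents/{doc.pk}/notes/").status_code == 200
response = client.post(f"/api/documents/{doc.pk}/notes/", data={"note": "hi"})
assert response.status_code == 403  # lacks change_document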

View File

CONSUMER_BARCODE_DPI: Final[int] = int(
os.getenv("PAPERLESS_CONSUMER_BARCODE_DPI", 300),
)
CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED",
)
CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME: Final[str] = os.getenv(
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME",
"double-sided",
)
CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
"PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
)
OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
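In deployment terms these map straight to environment variables; a hypothetical .env fragment enabling the collation feature ("double-sided" is the default shown in the getenv call above, and the boolean values are illustrative):

PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=true
PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME=double-sided
PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=false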

View File

@@ -1,6 +1,7 @@
from django import forms
from django.contrib import admin
from django.utils.translation import gettext_lazy as _
from guardian.admin import GuardedModelAdmin
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
@@ -31,7 +32,7 @@ class MailAccountAdminForm(forms.ModelForm):
]
class MailAccountAdmin(admin.ModelAdmin):
class MailAccountAdmin(GuardedModelAdmin):
list_display = ("name", "imap_server", "username")
fieldsets = [
@@ -45,7 +46,7 @@ class MailAccountAdmin(admin.ModelAdmin):
form = MailAccountAdminForm
class MailRuleAdmin(admin.ModelAdmin):
class MailRuleAdmin(GuardedModelAdmin):
radio_fields = {
"attachment_type": admin.VERTICAL,
"action": admin.VERTICAL,

View File

@@ -2,6 +2,7 @@ import datetime
import itertools
import logging
import os
import ssl
import tempfile
import traceback
from datetime import date
@@ -394,13 +395,12 @@ def get_mailbox(server, port, security) -> MailBox:
"""
Returns the correct MailBox instance for the given configuration.
"""
if security == MailAccount.ImapSecurity.NONE:
mailbox = MailBoxUnencrypted(server, port)
elif security == MailAccount.ImapSecurity.STARTTLS:
mailbox = MailBoxTls(server, port)
mailbox = MailBoxTls(server, port, ssl_context=ssl.create_default_context())
elif security == MailAccount.ImapSecurity.SSL:
mailbox = MailBox(server, port)
mailbox = MailBox(server, port, ssl_context=ssl.create_default_context())
else:
raise NotImplementedError("Unknown IMAP security") # pragma: nocover
return mailbox
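Passing ssl.create_default_context() makes the TLS posture explicit: the default context loads the system trust store and enables certificate and hostname verification. A standalone check, plus hypothetical usage:

import ssl
from imap_tools import MailBox

ctx = ssl.create_default_context()
assert ctx.verify_mode == ssl.CERT_REQUIRED
assert ctx.check_hostname

# Hypothetical connection (server details assumed):
# with MailBox("imap.example.com", 993, ssl_context=ctx).login(user, pw) as mb:
#     ...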

View File

@@ -25,7 +25,6 @@ class MailAccountSerializer(OwnedObjectSerializer):
class Meta:
model = MailAccount
depth = 1
fields = [
"id",
"name",
@@ -36,6 +35,10 @@ class MailAccountSerializer(OwnedObjectSerializer):
"password",
"character_set",
"is_token",
"owner",
"user_can_change",
"permissions",
"set_permissions",
]
def update(self, instance, validated_data):
@@ -67,7 +70,6 @@ class MailRuleSerializer(OwnedObjectSerializer):
class Meta:
model = MailRule
depth = 1
fields = [
"id",
"name",
@@ -89,6 +91,10 @@ class MailRuleSerializer(OwnedObjectSerializer):
"order",
"attachment_type",
"consumption_scope",
"owner",
"user_can_change",
"permissions",
"set_permissions",
]
def update(self, instance, validated_data):
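With depth removed and the ownership fields added, a mail-rule payload now carries owner and permission data; a hypothetical response shape (the exact permissions structure is an assumption, not read from the serializer):

{
    "id": 1,
    "name": "Rule1",
    "order": 0,
    "owner": 2,
    "user_can_change": true,
    "permissions": {
        "view": {"users": [3], "groups": []},
        "change": {"users": [], "groups": []}
    }
}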

View File

@@ -1,7 +1,9 @@
import json
from unittest import mock
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from guardian.shortcuts import assign_perm
from rest_framework import status
from rest_framework.test import APITestCase
@@ -27,7 +29,9 @@ class TestAPIMailAccounts(DirectoriesMixin, APITestCase):
super().setUp()
self.user = User.objects.create_superuser(username="temp_admin")
self.user = User.objects.create_user(username="temp_admin")
self.user.user_permissions.add(*Permission.objects.all())
self.user.save()
self.client.force_authenticate(user=self.user)
def test_get_mail_accounts(self):
@@ -266,6 +270,73 @@ class TestAPIMailAccounts(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data["success"], True)
def test_get_mail_accounts_owner_aware(self):
"""
GIVEN:
- Configured accounts with different users
WHEN:
- API call is made to get mail accounts
THEN:
- Only unowned, owned by user or granted accounts are provided
"""
user2 = User.objects.create_user(username="temp_admin2")
account1 = MailAccount.objects.create(
name="Email1",
username="username1",
password="password1",
imap_server="server.example.com",
imap_port=443,
imap_security=MailAccount.ImapSecurity.SSL,
character_set="UTF-8",
)
account2 = MailAccount.objects.create(
name="Email2",
username="username2",
password="password2",
imap_server="server.example.com",
imap_port=443,
imap_security=MailAccount.ImapSecurity.SSL,
character_set="UTF-8",
)
account2.owner = self.user
account2.save()
account3 = MailAccount.objects.create(
name="Email3",
username="username3",
password="password3",
imap_server="server.example.com",
imap_port=443,
imap_security=MailAccount.ImapSecurity.SSL,
character_set="UTF-8",
)
account3.owner = user2
account3.save()
account4 = MailAccount.objects.create(
name="Email4",
username="username4",
password="password4",
imap_server="server.example.com",
imap_port=443,
imap_security=MailAccount.ImapSecurity.SSL,
character_set="UTF-8",
)
account4.owner = user2
account4.save()
assign_perm("view_mailaccount", self.user, account4)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data["count"], 3)
self.assertEqual(response.data["results"][0]["name"], account1.name)
self.assertEqual(response.data["results"][1]["name"], account2.name)
self.assertEqual(response.data["results"][2]["name"], account4.name)
class TestAPIMailRules(DirectoriesMixin, APITestCase):
ENDPOINT = "/api/mail_rules/"
@@ -273,7 +344,9 @@ class TestAPIMailRules(DirectoriesMixin, APITestCase):
def setUp(self):
super().setUp()
self.user = User.objects.create_superuser(username="temp_admin")
self.user = User.objects.create_user(username="temp_admin")
self.user.user_permissions.add(*Permission.objects.all())
self.user.save()
self.client.force_authenticate(user=self.user)
def test_get_mail_rules(self):
@@ -533,3 +606,72 @@ class TestAPIMailRules(DirectoriesMixin, APITestCase):
returned_rule1 = MailRule.objects.get(pk=rule1.pk)
self.assertEqual(returned_rule1.name, "Updated Name 1")
self.assertEqual(returned_rule1.action, MailRule.MailAction.DELETE)
def test_get_mail_rules_owner_aware(self):
"""
GIVEN:
- Configured rules with different users
WHEN:
- API call is made to get mail rules
THEN:
- Only unowned, owned by user or granted mail rules are provided
"""
user2 = User.objects.create_user(username="temp_admin2")
account1 = MailAccount.objects.create(
name="Email1",
username="username1",
password="password1",
imap_server="server.example.com",
imap_port=443,
imap_security=MailAccount.ImapSecurity.SSL,
character_set="UTF-8",
)
rule1 = MailRule.objects.create(
name="Rule1",
account=account1,
folder="INBOX",
filter_from="from@example1.com",
order=0,
)
rule2 = MailRule.objects.create(
name="Rule2",
account=account1,
folder="INBOX",
filter_from="from@example2.com",
order=1,
)
rule2.owner = self.user
rule2.save()
rule3 = MailRule.objects.create(
name="Rule3",
account=account1,
folder="INBOX",
filter_from="from@example3.com",
order=2,
)
rule3.owner = user2
rule3.save()
rule4 = MailRule.objects.create(
name="Rule4",
account=account1,
folder="INBOX",
filter_from="from@example4.com",
order=3,
)
rule4.owner = user2
rule4.save()
assign_perm("view_mailrule", self.user, rule4)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertEqual(response.data["count"], 3)
self.assertEqual(response.data["results"][0]["name"], rule1.name)
self.assertEqual(response.data["results"][1]["name"], rule2.name)
self.assertEqual(response.data["results"][2]["name"], rule4.name)

View File

@@ -7,6 +7,8 @@ from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.viewsets import ModelViewSet
from documents.filters import ObjectOwnedOrGrantedPermissionsFilter
from documents.permissions import PaperlessObjectPermissions
from documents.views import PassUserMixin
from paperless.views import StandardPagination
from paperless_mail.mail import MailError
@@ -24,7 +26,8 @@ class MailAccountViewSet(ModelViewSet, PassUserMixin):
queryset = MailAccount.objects.all().order_by("pk")
serializer_class = MailAccountSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)
class MailRuleViewSet(ModelViewSet, PassUserMixin):
@@ -33,7 +36,8 @@ class MailRuleViewSet(ModelViewSet, PassUserMixin):
queryset = MailRule.objects.all().order_by("order")
serializer_class = MailRuleSerializer
pagination_class = StandardPagination
permission_classes = (IsAuthenticated,)
permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)
class MailAccountTestView(GenericAPIView):
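Conceptually, ObjectOwnedOrGrantedPermissionsFilter narrows each queryset to objects that are unowned, owned by the requester, or explicitly granted via guardian. A rough sketch of the idea (an illustration, not the filter's actual implementation):

from django.db.models import Q
from guardian.shortcuts import get_objects_for_user

def owned_or_granted(queryset, user, perm):
    # e.g. perm = "paperless_mail.view_mailaccount" (assumed codename)
    granted = get_objects_for_user(user, perm, klass=queryset)
    return queryset.filter(Q(owner__isnull=True) | Q(owner=user)) | granted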

View File

@@ -861,8 +861,9 @@ class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
self.assertIsFile(parser.archive_path)
# OCR consistently mangles this space, oh well
self.assertIn(
"this is awebp document, created 11/14/2022.",
# Older tesseracts consistently mangle the space between "a webp",
# tesseract 5.3.0 seems to do a better job, so we're accepting both
self.assertRegex(
parser.get_text().lower(),
r"this is a ?webp document, created 11/14/2022.",
)
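The "a ?webp" pattern makes the space optional, so output from both tesseract generations passes:

import re

pattern = r"this is a ?webp document, created 11/14/2022."
assert re.search(pattern, "this is awebp document, created 11/14/2022.")
assert re.search(pattern, "this is a webp document, created 11/14/2022.")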