Mirror of https://github.com/paperless-ngx/paperless-ngx.git, synced 2025-07-30 18:27:45 -05:00

Merge remote-tracking branch 'paperless-ngx/dev' into dev
@@ -1,15 +1,12 @@
import logging
import shutil
import tempfile
from dataclasses import dataclass
from pathlib import Path
from subprocess import run
from typing import Dict
from typing import Final
from typing import List
from typing import Optional

import img2pdf
from django.conf import settings
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
@@ -17,7 +14,10 @@ from pikepdf import Page
from pikepdf import Pdf
from PIL import Image

from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import DocumentSource
from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats

logger = logging.getLogger("paperless.barcodes")

@@ -54,7 +54,7 @@ class BarcodeReader:
        self.mime: Final[str] = mime_type
        self.pdf_file: Path = self.file
        self.barcodes: List[Barcode] = []
        self.temp_dir: Optional[Path] = None
        self.temp_dir: Optional[tempfile.TemporaryDirectory] = None

        if settings.CONSUMER_BARCODE_TIFF_SUPPORT:
            self.SUPPORTED_FILE_MIMES = {"application/pdf", "image/tiff"}
@@ -154,34 +154,7 @@ class BarcodeReader:
        if self.mime != "image/tiff":
            return

        with Image.open(self.file) as im:
            has_alpha_layer = im.mode in ("RGBA", "LA")
        if has_alpha_layer:
            # Note the save into the temp folder, so as not to trigger a new
            # consume
            scratch_image = Path(self.temp_dir.name) / Path(self.file.name)
            run(
                [
                    settings.CONVERT_BINARY,
                    "-alpha",
                    "off",
                    self.file,
                    scratch_image,
                ],
            )
        else:
            # Not modifying the original, safe to use in place
            scratch_image = self.file

        self.pdf_file = Path(self.temp_dir.name) / Path(self.file.name).with_suffix(
            ".pdf",
        )

        with scratch_image.open("rb") as img_file, self.pdf_file.open("wb") as pdf_file:
            pdf_file.write(img2pdf.convert(img_file))

        # Copy what file stat is possible
        shutil.copystat(self.file, self.pdf_file)
        self.pdf_file = convert_from_tiff_to_pdf(self.file, Path(self.temp_dir.name))

    def detect(self) -> None:
        """
@@ -306,7 +279,7 @@ class BarcodeReader:
            with open(savepath, "wb") as out:
                dst.save(out)

            shutil.copystat(self.file, savepath)
            copy_basic_file_stats(self.file, savepath)

            document_paths.append(savepath)

@@ -363,5 +336,5 @@ class BarcodeReader:
        else:
            dest = save_to_dir
        logger.info(f"Saving {document_path} to {dest}")
        shutil.copy2(document_path, dest)
        copy_file_with_basic_stats(document_path, dest)
        return True
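Several hunks in this commit swap shutil.copystat/shutil.copy2 for the new documents.utils helpers. Their bodies are not part of this diff; a plausible sketch, assuming they only carry over basic timestamps (the helper names match the imports above, the implementations are illustrative):

import os
import shutil
from pathlib import Path

def copy_basic_file_stats(source: Path, dest: Path) -> None:
    # Carry over access/modification times only, sidestepping the
    # permission-bit copying that shutil.copystat would also attempt.
    st = source.stat()
    os.utime(dest, times=(st.st_atime, st.st_mtime))

def copy_file_with_basic_stats(source: Path, dest: Path) -> None:
    # shutil.copy plus the stat copy above, i.e. a tamer shutil.copy2.
    shutil.copy(source, dest)
    copy_basic_file_stats(source, dest)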
@@ -5,6 +5,7 @@ import re
import warnings
from datetime import datetime
from hashlib import sha256
from pathlib import Path
from typing import Iterator
from typing import List
from typing import Optional
@@ -81,7 +82,7 @@ class DocumentClassifier:
        self._stemmer = None
        self._stop_words = None

    def load(self):
    def load(self) -> None:
        # Catch warnings for processing
        with warnings.catch_warnings(record=True) as w:
            with open(settings.MODEL_FILE, "rb") as f:
@@ -120,19 +121,20 @@ class DocumentClassifier:
            raise IncompatibleClassifierVersionError

    def save(self):
        target_file = settings.MODEL_FILE
        target_file_temp = settings.MODEL_FILE.with_suffix(".pickle.part")
        target_file: Path = settings.MODEL_FILE
        target_file_temp = target_file.with_suffix(".pickle.part")

        with open(target_file_temp, "wb") as f:
            pickle.dump(self.FORMAT_VERSION, f)

            pickle.dump(self.last_doc_change_time, f)
            pickle.dump(self.last_auto_type_hash, f)

            pickle.dump(self.data_vectorizer, f)

            pickle.dump(self.tags_binarizer, f)

            pickle.dump(self.tags_classifier, f)

            pickle.dump(self.correspondent_classifier, f)
            pickle.dump(self.document_type_classifier, f)
            pickle.dump(self.storage_path_classifier, f)
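The ".pickle.part" suffix suggests a write-to-temp-then-swap save; the swap itself is outside the shown hunk, so the following is an assumption about intent rather than the diff's own code, reduced to a generic sketch:

import os
import pickle
from pathlib import Path

def atomic_pickle(obj, target: Path) -> None:
    tmp = target.with_suffix(".pickle.part")
    with open(tmp, "wb") as f:
        pickle.dump(obj, f)
    os.replace(tmp, target)  # atomic on POSIX: readers never see a partial file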
@@ -247,7 +249,7 @@ class DocumentClassifier:
        data_vectorized = self.data_vectorizer.fit_transform(content_generator())

        # See the notes here:
        # https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html  # noqa: 501
        # https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html  # noqa: E501
        # This attribute isn't needed to function and can be large
        self.data_vectorizer.stop_words_ = None

@@ -380,7 +382,7 @@ class DocumentClassifier:

        return content

    def predict_correspondent(self, content: str):
    def predict_correspondent(self, content: str) -> Optional[int]:
        if self.correspondent_classifier:
            X = self.data_vectorizer.transform([self.preprocess_content(content)])
            correspondent_id = self.correspondent_classifier.predict(X)
@@ -391,7 +393,7 @@ class DocumentClassifier:
        else:
            return None

    def predict_document_type(self, content: str):
    def predict_document_type(self, content: str) -> Optional[int]:
        if self.document_type_classifier:
            X = self.data_vectorizer.transform([self.preprocess_content(content)])
            document_type_id = self.document_type_classifier.predict(X)
@@ -402,7 +404,7 @@ class DocumentClassifier:
        else:
            return None

    def predict_tags(self, content: str):
    def predict_tags(self, content: str) -> List[int]:
        from sklearn.utils.multiclass import type_of_target

        if self.tags_classifier:
@@ -423,7 +425,7 @@ class DocumentClassifier:
        else:
            return []

    def predict_storage_path(self, content: str):
    def predict_storage_path(self, content: str) -> Optional[int]:
        if self.storage_path_classifier:
            X = self.data_vectorizer.transform([self.preprocess_content(content)])
            storage_path_id = self.storage_path_classifier.predict(X)

@@ -1,9 +1,9 @@
import datetime
import hashlib
import os
import shutil
import tempfile
import uuid
from enum import Enum
from pathlib import Path
from subprocess import CompletedProcess
from subprocess import run
@@ -21,6 +21,9 @@ from django.utils import timezone
from filelock import FileLock
from rest_framework.reverse import reverse

from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats

from .classifier import load_classifier
from .file_handling import create_source_path_directory
from .file_handling import generate_unique_filename
@@ -42,21 +45,30 @@ class ConsumerError(Exception):
    pass


MESSAGE_DOCUMENT_ALREADY_EXISTS = "document_already_exists"
MESSAGE_ASN_ALREADY_EXISTS = "asn_already_exists"
MESSAGE_ASN_RANGE = "asn_value_out_of_range"
MESSAGE_FILE_NOT_FOUND = "file_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found"
MESSAGE_PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error"
MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found"
MESSAGE_POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error"
MESSAGE_NEW_FILE = "new_file"
MESSAGE_UNSUPPORTED_TYPE = "unsupported_type"
MESSAGE_PARSING_DOCUMENT = "parsing_document"
MESSAGE_GENERATING_THUMBNAIL = "generating_thumbnail"
MESSAGE_PARSE_DATE = "parse_date"
MESSAGE_SAVE_DOCUMENT = "save_document"
MESSAGE_FINISHED = "finished"
class ConsumerStatusShortMessage(str, Enum):
    DOCUMENT_ALREADY_EXISTS = "document_already_exists"
    ASN_ALREADY_EXISTS = "asn_already_exists"
    ASN_RANGE = "asn_value_out_of_range"
    FILE_NOT_FOUND = "file_not_found"
    PRE_CONSUME_SCRIPT_NOT_FOUND = "pre_consume_script_not_found"
    PRE_CONSUME_SCRIPT_ERROR = "pre_consume_script_error"
    POST_CONSUME_SCRIPT_NOT_FOUND = "post_consume_script_not_found"
    POST_CONSUME_SCRIPT_ERROR = "post_consume_script_error"
    NEW_FILE = "new_file"
    UNSUPPORTED_TYPE = "unsupported_type"
    PARSING_DOCUMENT = "parsing_document"
    GENERATING_THUMBNAIL = "generating_thumbnail"
    PARSE_DATE = "parse_date"
    SAVE_DOCUMENT = "save_document"
    FINISHED = "finished"
    FAILED = "failed"


class ConsumerFilePhase(str, Enum):
    STARTED = "STARTED"
    WORKING = "WORKING"
    SUCCESS = "SUCCESS"
    FAILED = "FAILED"
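Because both enums mix in str, their members stay interchangeable with the old module-level string constants, so existing comparisons and serialized websocket payloads keep sending the same strings. A small sketch (not part of the diff):

assert ConsumerFilePhase.SUCCESS == "SUCCESS"
assert ConsumerStatusShortMessage.NEW_FILE == "new_file"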


class Consumer(LoggingMixin):
@@ -64,10 +76,10 @@ class Consumer(LoggingMixin):

    def _send_progress(
        self,
        current_progress,
        max_progress,
        status,
        message=None,
        current_progress: int,
        max_progress: int,
        status: ConsumerFilePhase,
        message: Optional[ConsumerStatusShortMessage] = None,
        document_id=None,
    ):  # pragma: no cover
        payload = {
@@ -86,12 +98,12 @@ class Consumer(LoggingMixin):

    def _fail(
        self,
        message,
        log_message=None,
        message: ConsumerStatusShortMessage,
        log_message: Optional[str] = None,
        exc_info=None,
        exception: Optional[Exception] = None,
    ):
        self._send_progress(100, 100, "FAILED", message)
        self._send_progress(100, 100, ConsumerFilePhase.FAILED, message)
        self.log.error(log_message or message, exc_info=exc_info)
        raise ConsumerError(f"{self.filename}: {log_message or message}") from exception

@@ -111,13 +123,19 @@ class Consumer(LoggingMixin):
        self.channel_layer = get_channel_layer()

    def pre_check_file_exists(self):
        """
        Confirm the input file still exists where it should
        """
        if not os.path.isfile(self.path):
            self._fail(
                MESSAGE_FILE_NOT_FOUND,
                ConsumerStatusShortMessage.FILE_NOT_FOUND,
                f"Cannot consume {self.path}: File not found.",
            )

    def pre_check_duplicate(self):
        """
        Using the MD5 of the file, check this exact file doesn't already exist
        """
        with open(self.path, "rb") as f:
            checksum = hashlib.md5(f.read()).hexdigest()
        existing_doc = Document.objects.filter(
@@ -127,12 +145,15 @@ class Consumer(LoggingMixin):
            if settings.CONSUMER_DELETE_DUPLICATES:
                os.unlink(self.path)
            self._fail(
                MESSAGE_DOCUMENT_ALREADY_EXISTS,
                ConsumerStatusShortMessage.DOCUMENT_ALREADY_EXISTS,
                f"Not consuming {self.filename}: It is a duplicate of"
                f" {existing_doc.get().title} (#{existing_doc.get().pk})",
            )

    def pre_check_directories(self):
        """
        Ensure all required directories exist before attempting to use them
        """
        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
        os.makedirs(settings.THUMBNAIL_DIR, exist_ok=True)
        os.makedirs(settings.ORIGINALS_DIR, exist_ok=True)
@@ -152,7 +173,7 @@ class Consumer(LoggingMixin):
            or self.override_asn > Document.ARCHIVE_SERIAL_NUMBER_MAX
        ):
            self._fail(
                MESSAGE_ASN_RANGE,
                ConsumerStatusShortMessage.ASN_RANGE,
                f"Not consuming {self.filename}: "
                f"Given ASN {self.override_asn} is out of range "
                f"[{Document.ARCHIVE_SERIAL_NUMBER_MIN:,}, "
@@ -160,17 +181,21 @@ class Consumer(LoggingMixin):
            )
        if Document.objects.filter(archive_serial_number=self.override_asn).exists():
            self._fail(
                MESSAGE_ASN_ALREADY_EXISTS,
                ConsumerStatusShortMessage.ASN_ALREADY_EXISTS,
                f"Not consuming {self.filename}: Given ASN already exists!",
            )

    def run_pre_consume_script(self):
        """
        If one is configured and exists, run the pre-consume script and
        handle its output and/or errors
        """
        if not settings.PRE_CONSUME_SCRIPT:
            return

        if not os.path.isfile(settings.PRE_CONSUME_SCRIPT):
            self._fail(
                MESSAGE_PRE_CONSUME_SCRIPT_NOT_FOUND,
                ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_NOT_FOUND,
                f"Configured pre-consume script "
                f"{settings.PRE_CONSUME_SCRIPT} does not exist.",
            )
@@ -201,19 +226,23 @@ class Consumer(LoggingMixin):

        except Exception as e:
            self._fail(
                MESSAGE_PRE_CONSUME_SCRIPT_ERROR,
                ConsumerStatusShortMessage.PRE_CONSUME_SCRIPT_ERROR,
                f"Error while executing pre-consume script: {e}",
                exc_info=True,
                exception=e,
            )

    def run_post_consume_script(self, document: Document):
        """
        If one is configured and exists, run the post-consume script and
        handle its output and/or errors
        """
        if not settings.POST_CONSUME_SCRIPT:
            return

        if not os.path.isfile(settings.POST_CONSUME_SCRIPT):
            self._fail(
                MESSAGE_POST_CONSUME_SCRIPT_NOT_FOUND,
                ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_NOT_FOUND,
                f"Configured post-consume script "
                f"{settings.POST_CONSUME_SCRIPT} does not exist.",
            )
@@ -274,7 +303,7 @@ class Consumer(LoggingMixin):

        except Exception as e:
            self._fail(
                MESSAGE_POST_CONSUME_SCRIPT_ERROR,
                ConsumerStatusShortMessage.POST_CONSUME_SCRIPT_ERROR,
                f"Error while executing post-consume script: {e}",
                exc_info=True,
                exception=e,
@@ -308,7 +337,12 @@ class Consumer(LoggingMixin):
        self.override_asn = override_asn
        self.override_owner_id = override_owner_id

        self._send_progress(0, 100, "STARTING", MESSAGE_NEW_FILE)
        self._send_progress(
            0,
            100,
            ConsumerFilePhase.STARTED,
            ConsumerStatusShortMessage.NEW_FILE,
        )

        # Make sure that preconditions for consuming the file are met.

@@ -326,7 +360,7 @@ class Consumer(LoggingMixin):
            dir=settings.SCRATCH_DIR,
        )
        self.path = Path(tempdir.name) / Path(self.filename)
        shutil.copy2(self.original_path, self.path)
        copy_file_with_basic_stats(self.original_path, self.path)

        # Determine the parser class.

@@ -340,7 +374,10 @@ class Consumer(LoggingMixin):
        )
        if not parser_class:
            tempdir.cleanup()
            self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")
            self._fail(
                ConsumerStatusShortMessage.UNSUPPORTED_TYPE,
                f"Unsupported mime type {mime_type}",
            )

        # Notify all listeners that we're going to do some work.

@@ -355,7 +392,7 @@ class Consumer(LoggingMixin):
        def progress_callback(current_progress, max_progress):  # pragma: no cover
            # recalculate progress to be within 20 and 80
            p = int((current_progress / max_progress) * 50 + 20)
            self._send_progress(p, 100, "WORKING")
            self._send_progress(p, 100, ConsumerFilePhase.WORKING)

        # This doesn't parse the document yet, but gives us a parser.

@@ -377,12 +414,22 @@ class Consumer(LoggingMixin):
        archive_path = None

        try:
            self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT)
            self._send_progress(
                20,
                100,
                ConsumerFilePhase.WORKING,
                ConsumerStatusShortMessage.PARSING_DOCUMENT,
            )
            self.log.debug(f"Parsing {self.filename}...")
            document_parser.parse(self.path, mime_type, self.filename)

            self.log.debug(f"Generating thumbnail for {self.filename}...")
            self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)
            self._send_progress(
                70,
                100,
                ConsumerFilePhase.WORKING,
                ConsumerStatusShortMessage.GENERATING_THUMBNAIL,
            )
            thumbnail = document_parser.get_thumbnail(
                self.path,
                mime_type,
@@ -392,7 +439,12 @@ class Consumer(LoggingMixin):
            text = document_parser.get_text()
            date = document_parser.get_date()
            if date is None:
                self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE)
                self._send_progress(
                    90,
                    100,
                    ConsumerFilePhase.WORKING,
                    ConsumerStatusShortMessage.PARSE_DATE,
                )
                date = parse_date(self.filename, text)
            archive_path = document_parser.get_archive_path()

@@ -414,7 +466,12 @@ class Consumer(LoggingMixin):

        classifier = load_classifier()

        self._send_progress(95, 100, "WORKING", MESSAGE_SAVE_DOCUMENT)
        self._send_progress(
            95,
            100,
            ConsumerFilePhase.WORKING,
            ConsumerStatusShortMessage.SAVE_DOCUMENT,
        )
        # now that everything is done, we can start to store the document
        # in the system. This will be a transaction and reasonably fast.
        try:
@@ -499,7 +556,13 @@ class Consumer(LoggingMixin):

        self.log.info(f"Document {document} consumption finished")

        self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id)
        self._send_progress(
            100,
            100,
            ConsumerFilePhase.SUCCESS,
            ConsumerStatusShortMessage.FINISHED,
            document.id,
        )

        # Return the most up to date fields
        document.refresh_from_db()
@@ -585,7 +648,7 @@ class Consumer(LoggingMixin):

        # Attempt to copy file's original stats, but it's ok if we can't
        try:
            shutil.copystat(source, target)
            copy_basic_file_stats(source, target)
        except Exception:  # pragma: no cover
            pass

src/documents/converters.py (new file, 46 lines)
@@ -0,0 +1,46 @@
from pathlib import Path
from subprocess import run

import img2pdf
from django.conf import settings
from PIL import Image

from documents.utils import copy_basic_file_stats


def convert_from_tiff_to_pdf(tiff_path: Path, target_directory: Path) -> Path:
    """
    Converts a TIFF file into a PDF file.

    The PDF will be created in the given target_directory and share the name of
    the original TIFF file, as well as its stats (mtime etc.).

    Returns the path of the PDF created.
    """
    with Image.open(tiff_path) as im:
        has_alpha_layer = im.mode in ("RGBA", "LA")
    if has_alpha_layer:
        # Note the save into the temp folder, so as not to trigger a new
        # consume
        scratch_image = target_directory / tiff_path.name
        run(
            [
                settings.CONVERT_BINARY,
                "-alpha",
                "off",
                tiff_path,
                scratch_image,
            ],
        )
    else:
        # Not modifying the original, safe to use in place
        scratch_image = tiff_path

    pdf_path = (target_directory / tiff_path.name).with_suffix(".pdf")

    with scratch_image.open("rb") as img_file, pdf_path.open("wb") as pdf_file:
        pdf_file.write(img2pdf.convert(img_file))

    # Copy what file stat is possible
    copy_basic_file_stats(tiff_path, pdf_path)
    return pdf_path
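A minimal usage sketch for the new helper; the paths below are hypothetical examples, not part of the diff:

from pathlib import Path
from documents.converters import convert_from_tiff_to_pdf

pdf = convert_from_tiff_to_pdf(
    Path("/tmp/consume/scan.tiff"),  # source TIFF (hypothetical path)
    Path("/tmp/scratch"),            # directory that will receive the PDF
)
# -> /tmp/scratch/scan.pdf, with the TIFF's basic file stats copied over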
src/documents/double_sided.py (new file, 131 lines)
@@ -0,0 +1,131 @@
import datetime as dt
import logging
import os
import shutil
from pathlib import Path

from django.conf import settings
from pikepdf import Pdf

from documents.consumer import ConsumerError
from documents.converters import convert_from_tiff_to_pdf
from documents.data_models import ConsumableDocument

logger = logging.getLogger("paperless.double_sided")

# Hardcoded for now, could be made a configurable setting if needed
TIMEOUT_MINUTES = 30

# Used by test cases
STAGING_FILE_NAME = "double-sided-staging.pdf"


def collate(input_doc: ConsumableDocument) -> str:
    """
    Tries to collate pages from 2 single sided scans of a double sided
    document.

    When called with a file, it checks whether a staging file exists;
    if not, the current file is turned into that staging file
    containing the odd numbered pages.

    If a staging file exists, and it is not too old, the current file is
    considered to be the second part (the even numbered pages) and it will
    collate the pages of both; the pages of the second file will be added
    in reverse order, since the ADF will have scanned the pages from bottom
    to top.

    Returns a status message on success, or raises a ConsumerError
    in case of failure.
    """

    # Make sure scratch dir exists, Consumer might not have run yet
    settings.SCRATCH_DIR.mkdir(exist_ok=True)

    if input_doc.mime_type == "application/pdf":
        pdf_file = input_doc.original_file
    elif (
        input_doc.mime_type == "image/tiff"
        and settings.CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT
    ):
        pdf_file = convert_from_tiff_to_pdf(
            input_doc.original_file,
            settings.SCRATCH_DIR,
        )
        input_doc.original_file.unlink()
    else:
        raise ConsumerError("Unsupported file type for collation of double-sided scans")

    staging = settings.SCRATCH_DIR / STAGING_FILE_NAME

    valid_staging_exists = False
    if staging.exists():
        stats = os.stat(str(staging))
        # if the file is older than the timeout, we don't consider
        # it valid
        if dt.datetime.now().timestamp() - stats.st_mtime > TIMEOUT_MINUTES * 60:
            logger.warning("Outdated double sided staging file exists, deleting it")
            os.unlink(str(staging))
        else:
            valid_staging_exists = True

    if valid_staging_exists:
        try:
            # Collate pages from second PDF in reverse order
            with Pdf.open(staging) as pdf1, Pdf.open(pdf_file) as pdf2:
                pdf2.pages.reverse()
                try:
                    for i, page in enumerate(pdf2.pages):
                        pdf1.pages.insert(2 * i + 1, page)
                except IndexError:
                    raise ConsumerError(
                        "This second file (even numbered pages) contains more "
                        "pages than the first/odd numbered one. This means the "
                        "two uploaded files don't belong to the same double-"
                        "sided scan. Please retry, starting with the odd "
                        "numbered pages again.",
                    )
                # Merged file has the same path, but without the
                # double-sided subdir. Therefore, it is also in the
                # consumption dir and will be picked up for processing
                old_file = input_doc.original_file
                new_file = Path(
                    *(
                        part
                        for part in old_file.with_name(
                            f"{old_file.stem}-collated.pdf",
                        ).parts
                        if part != settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
                    ),
                )
                # If the user didn't create the subdirs yet, do it for them
                new_file.parent.mkdir(parents=True, exist_ok=True)
                pdf1.save(new_file)
                logger.info("Collated documents into new file %s", new_file)
                return (
                    "Success. Even numbered pages of double sided scan collated "
                    "with odd pages"
                )
        finally:
            # Delete staging and recently uploaded file no matter what.
            # If any error occurs, the user needs to be able to restart
            # the process from scratch; after all, the staging file
            # with the odd numbered pages might be the culprit
            pdf_file.unlink()
            staging.unlink()

    else:
        # In Python 3.9 move supports Path objects directly,
        # but for now we have to be compatible with 3.8
        shutil.move(str(pdf_file), str(staging))
        # update access to modification time so we know if the file
        # is outdated when another file gets uploaded
        os.utime(str(staging), (dt.datetime.now().timestamp(),) * 2)
        logger.info(
            "Got scan with odd numbered pages of double-sided scan, moved it to %s",
            staging,
        )
        return (
            "Received odd numbered pages of double sided scan, waiting up to "
            f"{TIMEOUT_MINUTES} minutes for even numbered pages"
        )
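The interleaving above can be illustrated standalone, with plain lists standing in for pikepdf page sequences (the page labels are hypothetical):

odd = ["p1", "p3", "p5"]    # staging file: first scan, odd pages top-down
even = ["p6", "p4", "p2"]   # second scan: the ADF feeds the stack bottom-up
even.reverse()              # -> ["p2", "p4", "p6"]
for i, page in enumerate(even):
    odd.insert(2 * i + 1, page)  # slot each even page after its odd page
assert odd == ["p1", "p2", "p3", "p4", "p5", "p6"]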
@@ -218,6 +218,7 @@ def generate_filename(
        tag_list=tag_list,
        owner_username=owner_username_str,
        original_name=original_name,
        doc_pk=f"{doc.pk:07}",
    ).strip()

    if settings.FILENAME_FORMAT_REMOVE_NONE:
@@ -11,13 +11,17 @@ from typing import Set
import tqdm
from django.conf import settings
from django.contrib.auth.models import Group
from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from django.contrib.contenttypes.models import ContentType
from django.core import serializers
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.db import transaction
from django.utils import timezone
from filelock import FileLock
from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission

from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_filename
@@ -33,6 +37,7 @@ from documents.models import UiSettings
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.utils import copy_file_with_basic_stats
from paperless import version
from paperless.db import GnuPG
from paperless_mail.models import MailAccount
@@ -261,6 +266,22 @@ class Command(BaseCommand):
            serializers.serialize("json", UiSettings.objects.all()),
        )

        manifest += json.loads(
            serializers.serialize("json", ContentType.objects.all()),
        )

        manifest += json.loads(
            serializers.serialize("json", Permission.objects.all()),
        )

        manifest += json.loads(
            serializers.serialize("json", UserObjectPermission.objects.all()),
        )

        manifest += json.loads(
            serializers.serialize("json", GroupObjectPermission.objects.all()),
        )

        # 3. Export files from each document
        for index, document_dict in tqdm.tqdm(
            enumerate(document_manifest),
@@ -417,4 +438,4 @@ class Command(BaseCommand):

        if perform_copy:
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(source, target)
            copy_file_with_basic_stats(source, target)

@@ -1,17 +1,20 @@
import json
import logging
import os
import shutil
from contextlib import contextmanager
from pathlib import Path

import tqdm
from django.conf import settings
from django.contrib.auth.models import Permission
from django.contrib.contenttypes.models import ContentType
from django.core.exceptions import FieldDoesNotExist
from django.core.management import call_command
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.core.serializers.base import DeserializationError
from django.db import IntegrityError
from django.db import transaction
from django.db.models.signals import m2m_changed
from django.db.models.signals import post_save
from filelock import FileLock
@@ -23,6 +26,7 @@ from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
from documents.signals.handlers import update_filename_and_move_files
from documents.utils import copy_file_with_basic_stats
from paperless import version


@@ -116,9 +120,13 @@ class Command(BaseCommand):
        ):
            # Fill up the database with whatever is in the manifest
            try:
                for manifest_path in manifest_paths:
                    call_command("loaddata", manifest_path)
            except (FieldDoesNotExist, DeserializationError) as e:
                with transaction.atomic():
                    for manifest_path in manifest_paths:
                        # delete these since pk can change, re-created from import
                        ContentType.objects.all().delete()
                        Permission.objects.all().delete()
                        call_command("loaddata", manifest_path)
            except (FieldDoesNotExist, DeserializationError, IntegrityError) as e:
                self.stdout.write(self.style.ERROR("Database import failed"))
                if (
                    self.version is not None
@@ -238,7 +246,7 @@ class Command(BaseCommand):

            create_source_path_directory(document.source_path)

            shutil.copy2(document_path, document.source_path)
            copy_file_with_basic_stats(document_path, document.source_path)

            if thumbnail_path:
                if thumbnail_path.suffix in {".png", ".PNG"}:
@@ -253,13 +261,16 @@ class Command(BaseCommand):
                        output_file=str(document.thumbnail_path),
                    )
                else:
                    shutil.copy2(thumbnail_path, document.thumbnail_path)
                    copy_file_with_basic_stats(
                        thumbnail_path,
                        document.thumbnail_path,
                    )

            if archive_path:
                create_source_path_directory(document.archive_path)
                # TODO: this assumes that the export is valid and
                # archive_filename is present on all documents with
                # archived files
                shutil.copy2(archive_path, document.archive_path)
                copy_file_with_basic_stats(archive_path, document.archive_path)

            document.save()

@@ -1,7 +1,9 @@
import logging
import re

from documents.classifier import DocumentClassifier
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import MatchingModel
from documents.models import StoragePath
@@ -11,7 +13,7 @@ from documents.permissions import get_objects_for_user_owner_aware
logger = logging.getLogger("paperless.matching")


def log_reason(matching_model, document, reason):
def log_reason(matching_model: MatchingModel, document: Document, reason: str):
    class_name = type(matching_model).__name__
    logger.debug(
        f"{class_name} {matching_model.name} matched on document "
@@ -19,7 +21,7 @@ def log_reason(matching_model, document, reason):
    )


def match_correspondents(document, classifier, user=None):
def match_correspondents(document: Document, classifier: DocumentClassifier, user=None):
    pred_id = classifier.predict_correspondent(document.content) if classifier else None

    if user is None and document.owner is not None:
@@ -35,11 +37,15 @@ def match_correspondents(document, classifier, user=None):
        correspondents = Correspondent.objects.all()

    return list(
        filter(lambda o: matches(o, document) or o.pk == pred_id, correspondents),
        filter(
            lambda o: matches(o, document)
            or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
            correspondents,
        ),
    )


def match_document_types(document, classifier, user=None):
def match_document_types(document: Document, classifier: DocumentClassifier, user=None):
    pred_id = classifier.predict_document_type(document.content) if classifier else None

    if user is None and document.owner is not None:
@@ -55,11 +61,15 @@ def match_document_types(document, classifier, user=None):
        document_types = DocumentType.objects.all()

    return list(
        filter(lambda o: matches(o, document) or o.pk == pred_id, document_types),
        filter(
            lambda o: matches(o, document)
            or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
            document_types,
        ),
    )


def match_tags(document, classifier, user=None):
def match_tags(document: Document, classifier: DocumentClassifier, user=None):
    predicted_tag_ids = classifier.predict_tags(document.content) if classifier else []

    if user is None and document.owner is not None:
@@ -71,11 +81,18 @@ def match_tags(document, classifier, user=None):
        tags = Tag.objects.all()

    return list(
        filter(lambda o: matches(o, document) or o.pk in predicted_tag_ids, tags),
        filter(
            lambda o: matches(o, document)
            or (
                o.matching_algorithm == MatchingModel.MATCH_AUTO
                and o.pk in predicted_tag_ids
            ),
            tags,
        ),
    )


def match_storage_paths(document, classifier, user=None):
def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None):
    pred_id = classifier.predict_storage_path(document.content) if classifier else None

    if user is None and document.owner is not None:
@@ -92,13 +109,14 @@ def match_storage_paths(document, classifier, user=None):

    return list(
        filter(
            lambda o: matches(o, document) or o.pk == pred_id,
            lambda o: matches(o, document)
            or (o.pk == pred_id and o.matching_algorithm == MatchingModel.MATCH_AUTO),
            storage_paths,
        ),
    )


def matches(matching_model, document):
def matches(matching_model: MatchingModel, document: Document):
    search_kwargs = {}

    document_content = document.content

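All four match_* functions tighten the same predicate: a classifier prediction now only selects an object that is actually configured for automatic matching. The shared shape, as a sketch:

def selected(obj, document, pred_id):
    # matches() keeps its usual rule-based behavior; the classifier's
    # pred_id only counts for objects using MATCH_AUTO.
    return matches(obj, document) or (
        obj.pk == pred_id and obj.matching_algorithm == MatchingModel.MATCH_AUTO
    )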
@@ -0,0 +1,162 @@
# Generated by Django 4.1.9 on 2023-06-29 19:29
import logging
import multiprocessing.pool
import shutil
import tempfile
import time
from pathlib import Path

import gnupg
from django.conf import settings
from django.db import migrations

from documents.parsers import run_convert

logger = logging.getLogger("paperless.migrations")


def _do_convert(work_package):
    (
        existing_encrypted_thumbnail,
        converted_encrypted_thumbnail,
        passphrase,
    ) = work_package

    try:
        gpg = gnupg.GPG(gnupghome=settings.GNUPG_HOME)

        logger.info(f"Decrypting thumbnail: {existing_encrypted_thumbnail}")

        # Decrypt png
        decrypted_thumbnail = existing_encrypted_thumbnail.with_suffix("").resolve()

        with open(existing_encrypted_thumbnail, "rb") as existing_encrypted_file:
            raw_thumb = gpg.decrypt_file(
                existing_encrypted_file,
                passphrase=passphrase,
                always_trust=True,
            ).data
            with open(decrypted_thumbnail, "wb") as decrypted_file:
                decrypted_file.write(raw_thumb)

        converted_decrypted_thumbnail = Path(
            str(converted_encrypted_thumbnail).replace("webp.gpg", "webp"),
        ).resolve()

        logger.info(f"Converting decrypted thumbnail: {decrypted_thumbnail}")

        # Convert to webp
        run_convert(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            auto_orient=True,
            input_file=f"{decrypted_thumbnail}[0]",
            output_file=str(converted_decrypted_thumbnail),
        )

        logger.info(
            f"Encrypting converted thumbnail: {converted_decrypted_thumbnail}",
        )

        # Encrypt webp
        with open(converted_decrypted_thumbnail, "rb") as converted_decrypted_file:
            encrypted = gpg.encrypt_file(
                fileobj_or_path=converted_decrypted_file,
                recipients=None,
                passphrase=passphrase,
                symmetric=True,
                always_trust=True,
            ).data

            with open(converted_encrypted_thumbnail, "wb") as converted_encrypted_file:
                converted_encrypted_file.write(encrypted)

        # Copy newly created thumbnail to thumbnail directory
        shutil.copy(converted_encrypted_thumbnail, existing_encrypted_thumbnail.parent)

        # Remove the existing encrypted PNG version
        existing_encrypted_thumbnail.unlink()

        # Remove the decrypted PNG version
        decrypted_thumbnail.unlink()

        # Remove the decrypted WebP version
        converted_decrypted_thumbnail.unlink()

        logger.info(
            "Conversion to WebP completed, "
            f"replaced {existing_encrypted_thumbnail.name} with {converted_encrypted_thumbnail.name}",
        )

    except Exception as e:
        logger.error(f"Error converting thumbnail (existing file unchanged): {e}")


def _convert_encrypted_thumbnails_to_webp(apps, schema_editor):
    start = time.time()

    with tempfile.TemporaryDirectory() as tempdir:
        work_packages = []

        if len(list(Path(settings.THUMBNAIL_DIR).glob("*.png.gpg"))) > 0:
            passphrase = settings.PASSPHRASE

            if not passphrase:
                raise Exception(
                    "Passphrase not defined, encrypted thumbnails cannot be migrated"
                    "without this",
                )

            for file in Path(settings.THUMBNAIL_DIR).glob("*.png.gpg"):
                existing_thumbnail = file.resolve()

                # Change the existing filename suffix from png to webp
                converted_thumbnail_name = Path(
                    str(existing_thumbnail).replace(".png.gpg", ".webp.gpg"),
                ).name

                # Create the expected output filename in the tempdir
                converted_thumbnail = (
                    Path(tempdir) / Path(converted_thumbnail_name)
                ).resolve()

                # Package up the necessary info
                work_packages.append(
                    (existing_thumbnail, converted_thumbnail, passphrase),
                )

        if len(work_packages):
            logger.info(
                "\n\n"
                "  This is a one-time only migration to convert thumbnails for all of your\n"
                "  *encrypted* documents into WebP format.  If you have a lot of encrypted documents, \n"
                "  this may take a while, so a coffee break may be in order."
                "\n",
            )

            with multiprocessing.pool.Pool(
                processes=min(multiprocessing.cpu_count(), 4),
                maxtasksperchild=4,
            ) as pool:
                pool.map(_do_convert, work_packages)

            end = time.time()
            duration = end - start

            logger.info(f"Conversion completed in {duration:.3f}s")


class Migration(migrations.Migration):
    dependencies = [
        ("documents", "1036_alter_savedviewfilterrule_rule_type"),
    ]

    operations = [
        migrations.RunPython(
            code=_convert_encrypted_thumbnails_to_webp,
            reverse_code=migrations.RunPython.noop,
        ),
    ]
@@ -18,6 +18,7 @@ from django.utils import timezone

from documents.loggers import LoggingMixin
from documents.signals import document_consumer_declaration
from documents.utils import copy_file_with_basic_stats

# This regular expression will try to find dates in the document at
# hand and will match the following formats:
@@ -31,16 +32,18 @@ from documents.signals import document_consumer_declaration
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
# - XX MON ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits. MONTH is 3 letters
# - XXPP MONTH ZZZZ with XX being 1 or 2 and PP being 2 letters and ZZZZ being 4 digits

# TODO: isnt there a date parsing library for this?

DATE_REGEX = re.compile(
    r"(\b|(?!=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|"  # noqa: E501
    r"(\b|(?!=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|"  # noqa: E501
    r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|"  # noqa: E501
    r"(\b|(?!=([_-])))([0-9]{1,2}[\. ]+[a-zA-Z]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|"  # noqa: E501
    r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|"
    r"(\b|(?!=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))|"
    r"(\b|(?!=([_-])))(\b[0-9]{1,2}[ \.\/-][A-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))",  # noqa: E501
    r"(\b|(?!=([_-])))([0-9]{1,2}[^ ]{2}[\. ]+[^ ]{3,9}[ \.\/-][0-9]{4})(\b|(?=([_-])))|"  # noqa: E501
    r"(\b|(?!=([_-])))(\b[0-9]{1,2}[ \.\/-][a-zA-Z]{3}[ \.\/-][0-9]{4})(\b|(?=([_-])))",  # noqa: E501
)

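The added ordinal branch ("XXPP MONTH ZZZZ") is what the new date tests later in this commit exercise; assuming the final pattern above, a quick check (the sample strings are illustrative):

for sample in ("21st MAR 2022", "22nd March 2022", "25TH MAR 2022"):
    assert DATE_REGEX.search(sample) is not None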
@@ -206,7 +209,7 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -
        # so we need to copy it before it gets moved.
        # https://github.com/paperless-ngx/paperless-ngx/issues/3631
        default_thumbnail_path = os.path.join(temp_dir, "document.png")
        shutil.copy2(get_default_thumbnail(), default_thumbnail_path)
        copy_file_with_basic_stats(get_default_thumbnail(), default_thumbnail_path)
        return default_thumbnail_path

@@ -1,6 +1,7 @@
import logging
import os
import shutil
from typing import Optional

from celery import states
from celery.signals import before_task_publish
@@ -21,6 +22,7 @@ from django.utils import timezone
from filelock import FileLock

from documents import matching
from documents.classifier import DocumentClassifier
from documents.file_handling import create_source_path_directory
from documents.file_handling import delete_empty_directories
from documents.file_handling import generate_unique_filename
@@ -33,7 +35,7 @@ from documents.permissions import get_objects_for_user_owner_aware
logger = logging.getLogger("paperless.handlers")


def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):
def add_inbox_tags(sender, document: Document, logging_group=None, **kwargs):
    if document.owner is not None:
        tags = get_objects_for_user_owner_aware(
            document.owner,
@@ -48,9 +50,9 @@ def add_inbox_tags(sender, document=None, logging_group=None, **kwargs):

def set_correspondent(
    sender,
    document=None,
    document: Document,
    logging_group=None,
    classifier=None,
    classifier: Optional[DocumentClassifier] = None,
    replace=False,
    use_first=True,
    suggest=False,
@@ -111,9 +113,9 @@ def set_correspondent(

def set_document_type(
    sender,
    document=None,
    document: Document,
    logging_group=None,
    classifier=None,
    classifier: Optional[DocumentClassifier] = None,
    replace=False,
    use_first=True,
    suggest=False,
@@ -175,9 +177,9 @@ def set_document_type(

def set_tags(
    sender,
    document=None,
    document: Document,
    logging_group=None,
    classifier=None,
    classifier: Optional[DocumentClassifier] = None,
    replace=False,
    suggest=False,
    base_url=None,
@@ -239,9 +241,9 @@ def set_tags(

def set_storage_path(
    sender,
    document=None,
    document: Document,
    logging_group=None,
    classifier=None,
    classifier: Optional[DocumentClassifier] = None,
    replace=False,
    use_first=True,
    suggest=False,
@@ -491,7 +493,7 @@ def update_filename_and_move_files(sender, instance: Document, **kwargs):
    )


def set_log_entry(sender, document=None, logging_group=None, **kwargs):
def set_log_entry(sender, document: Document, logging_group=None, **kwargs):
    ct = ContentType.objects.get(model="document")
    user = User.objects.get(username="consumer")

@@ -25,6 +25,7 @@ from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentMetadataOverrides
from documents.double_sided import collate
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
@@ -64,6 +65,12 @@ def train_classifier():
        and not Correspondent.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
        and not StoragePath.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
    ):
        logger.info("No automatic matching items, not training")
        # Special case, items were once auto and trained, so remove the model
        # and prevent its use again
        if settings.MODEL_FILE.exists():
            logger.info(f"Removing {settings.MODEL_FILE} so it won't be used")
            settings.MODEL_FILE.unlink()
        return

    classifier = load_classifier()
@@ -89,10 +96,40 @@ def consume_file(
    input_doc: ConsumableDocument,
    overrides: Optional[DocumentMetadataOverrides] = None,
):
    def send_progress(status="SUCCESS", message="finished"):
        payload = {
            "filename": overrides.filename or input_doc.original_file.name,
            "task_id": None,
            "current_progress": 100,
            "max_progress": 100,
            "status": status,
            "message": message,
        }
        try:
            async_to_sync(get_channel_layer().group_send)(
                "status_updates",
                {"type": "status_update", "data": payload},
            )
        except ConnectionError as e:
            logger.warning(f"ConnectionError on status send: {e!s}")

    # Default no overrides
    if overrides is None:
        overrides = DocumentMetadataOverrides()

    # Handle collation of double-sided documents scanned in two parts
    if settings.CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED and (
        settings.CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME
        in input_doc.original_file.parts
    ):
        try:
            msg = collate(input_doc)
            send_progress(message=msg)
            return msg
        except ConsumerError as e:
            send_progress(status="FAILURE", message=e.args[0])
            raise e

    # read all barcodes in the current document
    if settings.CONSUMER_ENABLE_BARCODES or settings.CONSUMER_ENABLE_ASN_BARCODE:
        with BarcodeReader(input_doc.original_file, input_doc.mime_type) as reader:
@@ -102,32 +139,18 @@ def consume_file(
            ):
                # notify the sender, otherwise the progress bar
                # in the UI stays stuck
                payload = {
                    "filename": overrides.filename or input_doc.original_file.name,
                    "task_id": None,
                    "current_progress": 100,
                    "max_progress": 100,
                    "status": "SUCCESS",
                    "message": "finished",
                }
                try:
                    async_to_sync(get_channel_layer().group_send)(
                        "status_updates",
                        {"type": "status_update", "data": payload},
                    )
                except ConnectionError as e:
                    logger.warning(f"ConnectionError on status send: {e!s}")
                send_progress()
                # consuming stops here, since the original document with
                # the barcodes has been split and will be consumed separately

                input_doc.original_file.unlink()
                return "File successfully split"

            # try reading the ASN from barcode
            if settings.CONSUMER_ENABLE_ASN_BARCODE:
            if settings.CONSUMER_ENABLE_ASN_BARCODE and reader.asn is not None:
                # Note this will take precedence over an API provided ASN
                # But it's from a physical barcode, so that's good
                overrides.asn = reader.asn
                if overrides.asn:
                    logger.info(f"Found ASN in barcode: {overrides.asn}")
                logger.info(f"Found ASN in barcode: {overrides.asn}")

    # continue with consumption if no barcode was found
    document = Consumer().try_consume_file(
src/documents/tests/samples/double-sided-even.pdf (new binary file, not shown)
src/documents/tests/samples/double-sided-odd.pdf (new binary file, not shown)
@@ -2369,6 +2369,62 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):

        self.assertEqual(resp_data["note"], "this is a posted note")

    def test_notes_permissions_aware(self):
        """
        GIVEN:
            - Existing document owned by user2 but with granted view perms for user1
        WHEN:
            - API request is made by user1 to add a note or delete
        THEN:
            - Notes are neither created nor deleted
        """
        user1 = User.objects.create_user(username="test1")
        user1.user_permissions.add(*Permission.objects.all())
        user1.save()

        user2 = User.objects.create_user(username="test2")
        user2.save()

        doc = Document.objects.create(
            title="test",
            mime_type="application/pdf",
            content="this is a document which will have notes added",
        )
        doc.owner = user2
        doc.save()

        self.client.force_authenticate(user1)

        resp = self.client.get(
            f"/api/documents/{doc.pk}/notes/",
            format="json",
        )
        self.assertEqual(resp.content, b"Insufficient permissions to view")
        self.assertEqual(resp.status_code, status.HTTP_403_FORBIDDEN)

        assign_perm("view_document", user1, doc)

        resp = self.client.post(
            f"/api/documents/{doc.pk}/notes/",
            data={"note": "this is a posted note"},
        )
        self.assertEqual(resp.content, b"Insufficient permissions to create")
        self.assertEqual(resp.status_code, status.HTTP_403_FORBIDDEN)

        note = Note.objects.create(
            note="This is a note.",
            document=doc,
            user=user2,
        )

        response = self.client.delete(
            f"/api/documents/{doc.pk}/notes/?id={note.pk}",
            format="json",
        )

        self.assertEqual(response.content, b"Insufficient permissions to delete")
        self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)

    def test_delete_note(self):
        """
        GIVEN:

@@ -21,6 +21,7 @@ from django.utils import timezone

from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.consumer import ConsumerFilePhase
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
@@ -228,8 +229,8 @@ def fake_magic_from_file(file, mime=False):
class TestConsumer(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    def _assert_first_last_send_progress(
        self,
        first_status="STARTING",
        last_status="SUCCESS",
        first_status=ConsumerFilePhase.STARTED,
        last_status=ConsumerFilePhase.SUCCESS,
        first_progress=0,
        first_progress_max=100,
        last_progress=100,
@@ -561,10 +562,16 @@ class TestConsumer(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

    @mock.patch("documents.consumer.load_classifier")
    def testClassifyDocument(self, m):
        correspondent = Correspondent.objects.create(name="test")
        dtype = DocumentType.objects.create(name="test")
        t1 = Tag.objects.create(name="t1")
        t2 = Tag.objects.create(name="t2")
        correspondent = Correspondent.objects.create(
            name="test",
            matching_algorithm=Correspondent.MATCH_AUTO,
        )
        dtype = DocumentType.objects.create(
            name="test",
            matching_algorithm=DocumentType.MATCH_AUTO,
        )
        t1 = Tag.objects.create(name="t1", matching_algorithm=Tag.MATCH_AUTO)
        t2 = Tag.objects.create(name="t2", matching_algorithm=Tag.MATCH_AUTO)

        m.return_value = MagicMock()
        m.return_value.predict_correspondent.return_value = correspondent.pk

@@ -152,6 +152,55 @@ class TestDate(TestCase):
        text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
        self.assertIsNone(parse_date("", text), None)

    def test_date_format_19(self):
        text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
        self.assertEqual(
            parse_date("", text),
            datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
        )

    def test_date_format_20(self):
        text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
        self.assertEqual(
            parse_date("", text),
            datetime.datetime(2022, 3, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
        )

    def test_date_format_21(self):
        text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
        self.assertEqual(
            parse_date("", text),
            datetime.datetime(2022, 3, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
        )

    def test_date_format_22(self):
        text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
        self.assertEqual(
            parse_date("", text),
            datetime.datetime(2022, 3, 23, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
        )

    def test_date_format_23(self):
        text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
        self.assertEqual(
            parse_date("", text),
            datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
        )

    def test_date_format_24(self):
        text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
        self.assertEqual(
            parse_date("", text),
            datetime.datetime(2022, 3, 21, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
        )

    def test_date_format_25(self):
        text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
        self.assertEqual(
            parse_date("", text),
            datetime.datetime(2022, 3, 25, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
        )

    def test_crazy_date_past(self, *args):
        self.assertIsNone(parse_date("", "01-07-0590 00:00:00"))

src/documents/tests/test_double_sided.py (new file, 253 lines)
@@ -0,0 +1,253 @@
import datetime as dt
import os
import shutil
from pathlib import Path
from typing import Union
from unittest import mock

from django.test import TestCase
from django.test import override_settings
from pdfminer.high_level import extract_text
from pikepdf import Pdf

from documents import tasks
from documents.consumer import ConsumerError
from documents.data_models import ConsumableDocument
from documents.data_models import DocumentSource
from documents.double_sided import STAGING_FILE_NAME
from documents.double_sided import TIMEOUT_MINUTES
from documents.tests.utils import DirectoriesMixin
from documents.tests.utils import FileSystemAssertsMixin


@override_settings(
    CONSUMER_RECURSIVE=True,
    CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=True,
)
class TestDoubleSided(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
    SAMPLE_DIR = Path(__file__).parent / "samples"

    def setUp(self):
        super().setUp()
        self.dirs.double_sided_dir = self.dirs.consumption_dir / "double-sided"
        self.dirs.double_sided_dir.mkdir()
        self.staging_file = self.dirs.scratch_dir / STAGING_FILE_NAME

    def consume_file(self, srcname, dstname: Union[str, Path] = "foo.pdf"):
        """
        Starts the consume process and also ensures the
        destination file does not exist afterwards
        """
        src = self.SAMPLE_DIR / srcname
        dst = self.dirs.double_sided_dir / dstname
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(src, dst)
        with mock.patch("documents.tasks.async_to_sync"), mock.patch(
            "documents.consumer.async_to_sync",
        ):
            msg = tasks.consume_file(
                ConsumableDocument(
                    source=DocumentSource.ConsumeFolder,
                    original_file=dst,
                ),
                None,
            )
        self.assertIsNotFile(dst)
        return msg

    def create_staging_file(self, src="double-sided-odd.pdf", datetime=None):
        shutil.copy(self.SAMPLE_DIR / src, self.staging_file)
        if datetime is None:
            datetime = dt.datetime.now()
        os.utime(str(self.staging_file), (datetime.timestamp(),) * 2)

    def test_odd_numbered_moved_to_staging(self):
        """
        GIVEN:
            - No staging file exists
        WHEN:
            - A file is copied into the double-sided consume directory
        THEN:
            - The file becomes the new staging file
            - The file in the consume directory gets removed
            - The staging file has the st_mtime set to now
            - The user gets informed
        """

        msg = self.consume_file("double-sided-odd.pdf")

        self.assertIsFile(self.staging_file)
        self.assertAlmostEqual(
            dt.datetime.fromtimestamp(self.staging_file.stat().st_mtime),
            dt.datetime.now(),
            delta=dt.timedelta(seconds=5),
        )
        self.assertIn("Received odd numbered pages", msg)

    def test_collation(self):
        """
        GIVEN:
            - A staging file not older than TIMEOUT_MINUTES with odd pages exists
        WHEN:
            - A file is copied into the double-sided consume directory
        THEN:
            - A new file containing the collated staging and uploaded file is
              created and put into the consume directory
            - The new file is named "foo-collated.pdf", where foo is the name of
              the second file
            - Both staging and uploaded file get deleted
            - The new file contains the pages in the correct order
        """

        self.create_staging_file()
        self.consume_file("double-sided-even.pdf", "some-random-name.pdf")

        target = self.dirs.consumption_dir / "some-random-name-collated.pdf"
        self.assertIsFile(target)
        self.assertIsNotFile(self.staging_file)
        self.assertRegex(
            extract_text(str(target)),
            r"(?s)"
            r"This is page 1.*This is page 2.*This is page 3.*"
            r"This is page 4.*This is page 5",
        )

    def test_staging_file_expiration(self):
        """
        GIVEN:
            - A staging file older than TIMEOUT_MINUTES exists
        WHEN:
            - A file is copied into the double-sided consume directory
        THEN:
            - It becomes the new staging file
        """

        self.create_staging_file(
            datetime=dt.datetime.now()
            - dt.timedelta(minutes=TIMEOUT_MINUTES, seconds=1),
        )
        msg = self.consume_file("double-sided-odd.pdf")
        self.assertIsFile(self.staging_file)
        self.assertIn("Received odd numbered pages", msg)

    def test_less_odd_pages_then_even_fails(self):
        """
        GIVEN:
            - A valid staging file
        WHEN:
            - A file is copied into the double-sided consume directory
              that has more pages than the staging file
        THEN:
            - Both files get removed
            - A ConsumerError exception is thrown
        """
        self.create_staging_file("simple.pdf")
        self.assertRaises(
            ConsumerError,
            self.consume_file,
            "double-sided-even.pdf",
        )
        self.assertIsNotFile(self.staging_file)

    @override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=True)
    def test_tiff_upload_enabled(self):
        """
        GIVEN:
            - CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is true
            - No staging file exists
        WHEN:
            - A TIFF file gets uploaded into the double-sided
              consume dir
        THEN:
            - The file is converted into a PDF and moved to
              the staging file
        """
        self.consume_file("simple.tiff", "simple.tiff")
        self.assertIsFile(self.staging_file)
        # Ensure the file is a valid PDF by trying to read it
        Pdf.open(self.staging_file)

    @override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT=False)
    def test_tiff_upload_disabled(self):
        """
        GIVEN:
            - CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT is false
            - No staging file exists
        WHEN:
            - A TIFF file gets uploaded into the double-sided
              consume dir
        THEN:
            - A ConsumerError is raised
        """
        self.assertRaises(
            ConsumerError,
            self.consume_file,
            "simple.tiff",
            "simple.tiff",
        )

    @override_settings(CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME="quux")
    def test_different_upload_dir_name(self):
        """
        GIVEN:
            - No staging file exists
            - CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME is set to quux
        WHEN:
            - A file is uploaded into the quux dir
        THEN:
            - A staging file is created
        """
        self.consume_file("double-sided-odd.pdf", Path("..") / "quux" / "foo.pdf")
        self.assertIsFile(self.staging_file)

    def test_only_double_sided_dir_is_handled(self):
        """
        GIVEN:
            - No staging file exists
        WHEN:
            - A file is uploaded into the normal consumption dir
        THEN:
            - The file is processed as normal
        """
        msg = self.consume_file("simple.pdf", Path("..") / "simple.pdf")
        self.assertIsNotFile(self.staging_file)
        self.assertRegex(msg, "Success. New document .* created")

    def test_subdirectory_upload(self):
        """
        GIVEN:
            - A staging file exists
        WHEN:
            - A file gets uploaded into foo/bar/double-sided
              or double-sided/foo/bar
        THEN:
            - The collated file gets put into foo/bar
        """
        for path in [
            Path("foo") / "bar" / "double-sided",
            Path("double-sided") / "foo" / "bar",
        ]:
            with self.subTest(path=path):
                # Ensure we get fresh directories for each run
                self.tearDown()
                self.setUp()

                self.create_staging_file()
                self.consume_file("double-sided-odd.pdf", path / "foo.pdf")
                self.assertIsFile(
                    self.dirs.consumption_dir / "foo" / "bar" / "foo-collated.pdf",
                )

    @override_settings(CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED=False)
    def test_disabled_double_sided_dir_upload(self):
        """
        GIVEN:
            - CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED is false
        WHEN:
            - A file is uploaded into the double-sided directory
        THEN:
            - The file is processed like a normal upload
        """
        msg = self.consume_file("simple.pdf")
        self.assertIsNotFile(self.staging_file)
        self.assertRegex(msg, "Success. New document .* created")
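The tests above fix the observable contract of the collation feature without showing its implementation. As orientation only, the flow they exercise looks roughly like the sketch below. This is not the actual documents.double_sided code: the helper name, the natural even-page ordering, and the placement of the collated file are assumptions made for illustration.

import datetime as dt
import shutil
from pathlib import Path

from pikepdf import Pdf


def handle_double_sided_upload(upload: Path, staging: Path, timeout_minutes: int) -> str:
    """Hypothetical sketch of the staging/collate flow described by the tests."""
    staging_is_fresh = (
        staging.exists()
        and dt.datetime.now() - dt.datetime.fromtimestamp(staging.stat().st_mtime)
        <= dt.timedelta(minutes=timeout_minutes)
    )
    if not staging_is_fresh:
        # First scan, or the previous staging file expired: keep the odd pages
        shutil.move(upload, staging)
        return "Received odd numbered pages"

    # Second scan: interleave odd (staging) and even (upload) pages
    target = upload.parent / f"{upload.stem}-collated.pdf"  # placement simplified
    with Pdf.open(staging) as odd, Pdf.open(upload) as even, Pdf.new() as out:
        if len(even.pages) > len(odd.pages):
            staging.unlink()
            upload.unlink()
            raise ValueError("More even pages than odd pages")
        for i, page in enumerate(odd.pages):
            out.pages.append(page)
            if i < len(even.pages):
                out.pages.append(even.pages[i])
        out.save(target)
    staging.unlink()
    upload.unlink()
    return f"Collated into {target}"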
@@ -446,6 +446,19 @@ class TestFileHandling(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        self.assertIsNotDir(os.path.join(settings.ORIGINALS_DIR, "none"))
        self.assertIsDir(settings.ORIGINALS_DIR)

    @override_settings(FILENAME_FORMAT="{doc_pk}")
    def test_format_doc_pk(self):
        document = Document()
        document.pk = 1
        document.mime_type = "application/pdf"
        document.storage_type = Document.STORAGE_TYPE_UNENCRYPTED

        self.assertEqual(generate_filename(document), "0000001.pdf")

        document.pk = 13579

        self.assertEqual(generate_filename(document), "0013579.pdf")

    @override_settings(FILENAME_FORMAT=None)
    def test_format_none(self):
        document = Document()
@@ -7,11 +7,18 @@ from pathlib import Path
from unittest import mock
from zipfile import ZipFile

from django.contrib.auth.models import Group
from django.contrib.auth.models import Permission
from django.contrib.contenttypes.models import ContentType
from django.core.management import call_command
from django.core.management.base import CommandError
from django.db import IntegrityError
from django.test import TestCase
from django.test import override_settings
from django.utils import timezone
from guardian.models import GroupObjectPermission
from guardian.models import UserObjectPermission
from guardian.shortcuts import assign_perm

from documents.management.commands import document_exporter
from documents.models import Correspondent

@@ -34,6 +41,8 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        self.addCleanup(shutil.rmtree, self.target)

        self.user = User.objects.create(username="temp_admin")
        self.user2 = User.objects.create(username="user2")
        self.group1 = Group.objects.create(name="group1")

        self.d1 = Document.objects.create(
            content="Content",

@@ -73,6 +82,9 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
            user=self.user,
        )

        assign_perm("view_document", self.user2, self.d2)
        assign_perm("view_document", self.group1, self.d3)

        self.t1 = Tag.objects.create(name="t")
        self.dt1 = DocumentType.objects.create(name="dt")
        self.c1 = Correspondent.objects.create(name="c")

@@ -141,12 +153,12 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

        manifest = self._do_export(use_filename_format=use_filename_format)

        self.assertEqual(len(manifest), 10)
        self.assertEqual(len(manifest), 149)

        # don't include consumer or AnonymousUser users
        self.assertEqual(
            len(list(filter(lambda e: e["model"] == "auth.user", manifest))),
            1,
            2,
        )

        self.assertEqual(

@@ -218,6 +230,9 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        Correspondent.objects.all().delete()
        DocumentType.objects.all().delete()
        Tag.objects.all().delete()
        Permission.objects.all().delete()
        UserObjectPermission.objects.all().delete()
        GroupObjectPermission.objects.all().delete()
        self.assertEqual(Document.objects.count(), 0)

        call_command("document_importer", "--no-progress-bar", self.target)

@@ -230,6 +245,9 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        self.assertEqual(Document.objects.get(id=self.d2.id).title, "wow2")
        self.assertEqual(Document.objects.get(id=self.d3.id).title, "wow2")
        self.assertEqual(Document.objects.get(id=self.d4.id).title, "wow_dec")
        self.assertEqual(GroupObjectPermission.objects.count(), 1)
        self.assertEqual(UserObjectPermission.objects.count(), 1)
        self.assertEqual(Permission.objects.count(), 108)
        messages = check_sanity()
        # everything is alright after the test
        self.assertEqual(len(messages), 0)

@@ -259,7 +277,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        st_mtime_1 = os.stat(os.path.join(self.target, "manifest.json")).st_mtime

        with mock.patch(
            "documents.management.commands.document_exporter.shutil.copy2",
            "documents.management.commands.document_exporter.copy_file_with_basic_stats",
        ) as m:
            self._do_export()
            m.assert_not_called()

@@ -270,7 +288,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        Path(self.d1.source_path).touch()

        with mock.patch(
            "documents.management.commands.document_exporter.shutil.copy2",
            "documents.management.commands.document_exporter.copy_file_with_basic_stats",
        ) as m:
            self._do_export()
            self.assertEqual(m.call_count, 1)

@@ -293,7 +311,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        self.assertIsFile(os.path.join(self.target, "manifest.json"))

        with mock.patch(
            "documents.management.commands.document_exporter.shutil.copy2",
            "documents.management.commands.document_exporter.copy_file_with_basic_stats",
        ) as m:
            self._do_export()
            m.assert_not_called()

@@ -304,7 +322,7 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        self.d2.save()

        with mock.patch(
            "documents.management.commands.document_exporter.shutil.copy2",
            "documents.management.commands.document_exporter.copy_file_with_basic_stats",
        ) as m:
            self._do_export(compare_checksums=True)
            self.assertEqual(m.call_count, 1)

@@ -641,3 +659,47 @@ class TestExportImport(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        self.assertEqual(Document.objects.count(), 0)
        call_command("document_importer", "--no-progress-bar", self.target)
        self.assertEqual(Document.objects.count(), 4)

    def test_import_db_transaction_failed(self):
        """
        GIVEN:
            - Import from manifest started
        WHEN:
            - Import of database fails
        THEN:
            - ContentType & Permission objects are not deleted, db transaction rolled back
        """

        shutil.rmtree(os.path.join(self.dirs.media_dir, "documents"))
        shutil.copytree(
            os.path.join(os.path.dirname(__file__), "samples", "documents"),
            os.path.join(self.dirs.media_dir, "documents"),
        )

        self.assertEqual(ContentType.objects.count(), 27)
        self.assertEqual(Permission.objects.count(), 108)

        manifest = self._do_export()

        with paperless_environment():
            self.assertEqual(
                len(list(filter(lambda e: e["model"] == "auth.permission", manifest))),
                108,
            )
            # add 1 more to db to show objects are not re-created by import
            Permission.objects.create(
                name="test",
                codename="test_perm",
                content_type_id=1,
            )
            self.assertEqual(Permission.objects.count(), 109)

            # will cause an import error
            self.user.delete()
            self.user = User.objects.create(username="temp_admin")

            with self.assertRaises(IntegrityError):
                call_command("document_importer", "--no-progress-bar", self.target)

            self.assertEqual(ContentType.objects.count(), 27)
            self.assertEqual(Permission.objects.count(), 109)
@@ -2,6 +2,7 @@ import hashlib
import os
import shutil
from pathlib import Path
from typing import Optional
from unittest import mock

from django.conf import settings

@@ -60,8 +61,8 @@ def make_test_document(
    mime_type: str,
    original: str,
    original_filename: str,
    archive: str = None,
    archive_filename: str = None,
    archive: Optional[str] = None,
    archive_filename: Optional[str] = None,
):
    doc = document_class()
    doc.filename = original_filename
276 src/documents/tests/test_migration_encrypted_webp_conversion.py Normal file
@@ -0,0 +1,276 @@
import shutil
import tempfile
from pathlib import Path
from typing import Callable
from typing import Iterable
from typing import Union
from unittest import mock

from django.test import override_settings

from documents.tests.utils import TestMigrations


@override_settings(PASSPHRASE="test")
@mock.patch(
    "documents.migrations.1037_webp_encrypted_thumbnail_conversion.multiprocessing.pool.Pool.map",
)
@mock.patch("documents.migrations.1037_webp_encrypted_thumbnail_conversion.run_convert")
class TestMigrateToEncrytpedWebPThumbnails(TestMigrations):
    migrate_from = "1036_alter_savedviewfilterrule_rule_type"
    migrate_to = "1037_webp_encrypted_thumbnail_conversion"
    auto_migrate = False

    def pretend_convert_output(self, *args, **kwargs):
        """
        Pretends to do the conversion, by copying the input file
        to the output file
        """
        shutil.copy2(
            Path(kwargs["input_file"].rstrip("[0]")),
            Path(kwargs["output_file"]),
        )

    def pretend_map(self, func: Callable, iterable: Iterable):
        """
        Pretends to be the map of a multiprocessing.Pool, but secretly does
        everything in series
        """
        for item in iterable:
            func(item)

    def create_dummy_thumbnails(
        self,
        thumb_dir: Path,
        ext: str,
        count: int,
        start_count: int = 0,
    ):
        """
        Helper to create a certain count of files of given extension in a given directory
        """
        for idx in range(count):
            (Path(thumb_dir) / Path(f"{start_count + idx:07}.{ext}")).touch()
        # Triple check expected files exist
        self.assert_file_count_by_extension(ext, thumb_dir, count)

    def create_webp_thumbnail_files(
        self,
        thumb_dir: Path,
        count: int,
        start_count: int = 0,
    ):
        """
        Creates a dummy WebP thumbnail file in the given directory, based on
        the database Document
        """
        self.create_dummy_thumbnails(thumb_dir, "webp", count, start_count)

    def create_encrypted_webp_thumbnail_files(
        self,
        thumb_dir: Path,
        count: int,
        start_count: int = 0,
    ):
        """
        Creates a dummy encrypted WebP thumbnail file in the given directory, based on
        the database Document
        """
        self.create_dummy_thumbnails(thumb_dir, "webp.gpg", count, start_count)

    def create_png_thumbnail_files(
        self,
        thumb_dir: Path,
        count: int,
        start_count: int = 0,
    ):
        """
        Creates a dummy PNG thumbnail file in the given directory, based on
        the database Document
        """

        self.create_dummy_thumbnails(thumb_dir, "png", count, start_count)

    def create_encrypted_png_thumbnail_files(
        self,
        thumb_dir: Path,
        count: int,
        start_count: int = 0,
    ):
        """
        Creates a dummy encrypted PNG thumbnail file in the given directory, based on
        the database Document
        """

        self.create_dummy_thumbnails(thumb_dir, "png.gpg", count, start_count)

    def assert_file_count_by_extension(
        self,
        ext: str,
        dir: Union[str, Path],
        expected_count: int,
    ):
        """
        Helper to assert a certain count of given extension files in given directory
        """
        if not isinstance(dir, Path):
            dir = Path(dir)
        matching_files = list(dir.glob(f"*.{ext}"))
        self.assertEqual(len(matching_files), expected_count)

    def assert_encrypted_png_file_count(self, dir: Path, expected_count: int):
        """
        Helper to assert a certain count of encrypted PNG extension files in given directory
        """
        self.assert_file_count_by_extension("png.gpg", dir, expected_count)

    def assert_encrypted_webp_file_count(self, dir: Path, expected_count: int):
        """
        Helper to assert a certain count of encrypted WebP extension files in given directory
        """
        self.assert_file_count_by_extension("webp.gpg", dir, expected_count)

    def assert_webp_file_count(self, dir: Path, expected_count: int):
        """
        Helper to assert a certain count of WebP extension files in given directory
        """
        self.assert_file_count_by_extension("webp", dir, expected_count)

    def assert_png_file_count(self, dir: Path, expected_count: int):
        """
        Helper to assert a certain count of PNG extension files in given directory
        """
        self.assert_file_count_by_extension("png", dir, expected_count)

    def setUp(self):
        self.thumbnail_dir = Path(tempfile.mkdtemp()).resolve()

        return super().setUp()

    def tearDown(self) -> None:
        shutil.rmtree(self.thumbnail_dir)

        return super().tearDown()

    def test_do_nothing_if_converted(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - Encrypted document exists with existing encrypted WebP thumbnail path
        WHEN:
            - Migration is attempted
        THEN:
            - Nothing is converted
        """
        map_mock.side_effect = self.pretend_map

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):
            self.create_encrypted_webp_thumbnail_files(self.thumbnail_dir, 3)

            self.performMigration()
            run_convert_mock.assert_not_called()

            self.assert_encrypted_webp_file_count(self.thumbnail_dir, 3)

    def test_convert_thumbnails(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - Encrypted documents exist with PNG thumbnail
        WHEN:
            - Migration is attempted
        THEN:
            - Thumbnails are converted to webp & re-encrypted
        """
        map_mock.side_effect = self.pretend_map
        run_convert_mock.side_effect = self.pretend_convert_output

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):
            self.create_encrypted_png_thumbnail_files(self.thumbnail_dir, 3)

            self.performMigration()

            run_convert_mock.assert_called()
            self.assertEqual(run_convert_mock.call_count, 3)

            self.assert_encrypted_webp_file_count(self.thumbnail_dir, 3)

    def test_convert_errors_out(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - Encrypted document exists with PNG thumbnail
        WHEN:
            - Migration is attempted, but raises an exception
        THEN:
            - Single thumbnail is converted
        """
        map_mock.side_effect = self.pretend_map
        run_convert_mock.side_effect = OSError

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):
            self.create_encrypted_png_thumbnail_files(self.thumbnail_dir, 3)

            self.performMigration()

            run_convert_mock.assert_called()
            self.assertEqual(run_convert_mock.call_count, 3)

            self.assert_encrypted_png_file_count(self.thumbnail_dir, 3)

    def test_convert_mixed(
        self,
        run_convert_mock: mock.MagicMock,
        map_mock: mock.MagicMock,
    ):
        """
        GIVEN:
            - Documents exist with PNG, encrypted PNG and WebP thumbnails
        WHEN:
            - Migration is attempted
        THEN:
            - Only encrypted PNG thumbnails are converted
        """
        map_mock.side_effect = self.pretend_map
        run_convert_mock.side_effect = self.pretend_convert_output

        with override_settings(
            THUMBNAIL_DIR=self.thumbnail_dir,
        ):
            self.create_png_thumbnail_files(self.thumbnail_dir, 3)
            self.create_encrypted_png_thumbnail_files(
                self.thumbnail_dir,
                3,
                start_count=3,
            )
            self.create_webp_thumbnail_files(self.thumbnail_dir, 2, start_count=6)
            self.create_encrypted_webp_thumbnail_files(
                self.thumbnail_dir,
                3,
                start_count=8,
            )

            self.performMigration()

            run_convert_mock.assert_called()
            self.assertEqual(run_convert_mock.call_count, 3)

            self.assert_png_file_count(self.thumbnail_dir, 3)
            self.assert_encrypted_webp_file_count(self.thumbnail_dir, 6)
            self.assert_webp_file_count(self.thumbnail_dir, 2)
            self.assert_encrypted_png_file_count(self.thumbnail_dir, 0)
43 src/documents/utils.py Normal file
@@ -0,0 +1,43 @@
import shutil
from os import utime
from pathlib import Path
from typing import Tuple
from typing import Union


def _coerce_to_path(
    source: Union[Path, str],
    dest: Union[Path, str],
) -> Tuple[Path, Path]:
    return Path(source).resolve(), Path(dest).resolve()


def copy_basic_file_stats(source: Union[Path, str], dest: Union[Path, str]) -> None:
    """
    Copies only the m_time and a_time attributes from source to destination.
    Both are expected to exist.

    The extended attribute copy does weird things with SELinux and files
    copied from temporary directories and copystat doesn't allow disabling
    these copies
    """
    source, dest = _coerce_to_path(source, dest)
    src_stat = source.stat()
    utime(dest, ns=(src_stat.st_atime_ns, src_stat.st_mtime_ns))


def copy_file_with_basic_stats(
    source: Union[Path, str],
    dest: Union[Path, str],
) -> None:
    """
    A sort of simpler copy2 that doesn't copy extended file attributes,
    only the access time and modified times from source to dest.

    The extended attribute copy does weird things with SELinux and files
    copied from temporary directories.
    """
    source, dest = _coerce_to_path(source, dest)

    shutil.copy(source, dest)
    copy_basic_file_stats(source, dest)
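A minimal usage sketch of the two new helpers (the paths here are hypothetical): they stand in for shutil.copy2 and shutil.copystat wherever only timestamps, not extended attributes, should travel with the file.

from documents.utils import copy_basic_file_stats
from documents.utils import copy_file_with_basic_stats

# Copy the bytes plus atime/mtime, but no xattrs (avoids the SELinux issue)
copy_file_with_basic_stats("/tmp/scan.pdf", "/data/media/scan.pdf")

# Or sync just the timestamps between two files that already exist
copy_basic_file_stats("/tmp/scan.pdf", "/data/media/scan.pdf")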
@@ -502,19 +502,18 @@ class DocumentViewSet(

    @action(methods=["get", "post", "delete"], detail=True)
    def notes(self, request, pk=None):
        currentUser = request.user
        try:
            doc = Document.objects.get(pk=pk)
            if request.user is not None and not has_perms_owner_aware(
                request.user,
            if currentUser is not None and not has_perms_owner_aware(
                currentUser,
                "view_document",
                doc,
            ):
                return HttpResponseForbidden("Insufficient permissions")
                return HttpResponseForbidden("Insufficient permissions to view")
        except Document.DoesNotExist:
            raise Http404

        currentUser = request.user

        if request.method == "GET":
            try:
                return Response(self.getNotes(doc))

@@ -525,6 +524,13 @@ class DocumentViewSet(
                )
        elif request.method == "POST":
            try:
                if currentUser is not None and not has_perms_owner_aware(
                    currentUser,
                    "change_document",
                    doc,
                ):
                    return HttpResponseForbidden("Insufficient permissions to create")

                c = Note.objects.create(
                    document=doc,
                    note=request.data["note"],

@@ -545,6 +551,13 @@ class DocumentViewSet(
                },
            )
        elif request.method == "DELETE":
            if currentUser is not None and not has_perms_owner_aware(
                currentUser,
                "change_document",
                doc,
            ):
                return HttpResponseForbidden("Insufficient permissions to delete")

            note = Note.objects.get(id=int(request.GET.get("id")))
            note.delete()
@@ -791,6 +791,18 @@ CONSUMER_BARCODE_DPI: Final[str] = int(
    os.getenv("PAPERLESS_CONSUMER_BARCODE_DPI", 300),
)

CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED: Final[bool] = __get_boolean(
    "PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED",
)

CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME: Final[str] = os.getenv(
    "PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME",
    "double-sided",
)

CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT: Final[bool] = __get_boolean(
    "PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT",
)

OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
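For reference, a hedged sketch of how the new settings are driven from the environment: the variable names come from the block above, the values are illustrative, and any of the usual truthy strings accepted by the boolean parser should work.

import os

# Must be set before Django loads paperless.settings
os.environ["PAPERLESS_CONSUMER_ENABLE_COLLATE_DOUBLE_SIDED"] = "true"
os.environ["PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_SUBDIR_NAME"] = "double-sided"
os.environ["PAPERLESS_CONSUMER_COLLATE_DOUBLE_SIDED_TIFF_SUPPORT"] = "true"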
@@ -1,6 +1,7 @@
from django import forms
from django.contrib import admin
from django.utils.translation import gettext_lazy as _
from guardian.admin import GuardedModelAdmin

from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule

@@ -31,7 +32,7 @@ class MailAccountAdminForm(forms.ModelForm):
    ]


class MailAccountAdmin(admin.ModelAdmin):
class MailAccountAdmin(GuardedModelAdmin):
    list_display = ("name", "imap_server", "username")

    fieldsets = [

@@ -45,7 +46,7 @@ class MailAccountAdmin(admin.ModelAdmin):
    form = MailAccountAdminForm


class MailRuleAdmin(admin.ModelAdmin):
class MailRuleAdmin(GuardedModelAdmin):
    radio_fields = {
        "attachment_type": admin.VERTICAL,
        "action": admin.VERTICAL,
@@ -2,6 +2,7 @@ import datetime
import itertools
import logging
import os
import ssl
import tempfile
import traceback
from datetime import date

@@ -394,13 +395,12 @@ def get_mailbox(server, port, security) -> MailBox:
    """
    Returns the correct MailBox instance for the given configuration.
    """

    if security == MailAccount.ImapSecurity.NONE:
        mailbox = MailBoxUnencrypted(server, port)
    elif security == MailAccount.ImapSecurity.STARTTLS:
        mailbox = MailBoxTls(server, port)
        mailbox = MailBoxTls(server, port, ssl_context=ssl.create_default_context())
    elif security == MailAccount.ImapSecurity.SSL:
        mailbox = MailBox(server, port)
        mailbox = MailBox(server, port, ssl_context=ssl.create_default_context())
    else:
        raise NotImplementedError("Unknown IMAP security")  # pragma: nocover
    return mailbox
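Both encrypted branches now pass an explicit context: ssl.create_default_context() enables certificate validation and hostname checking with sane defaults. As a hedged aside, a deployment testing against a self-signed IMAP server could hand in a relaxed context instead (assuming, as the calls above suggest, that imap_tools accepts any ssl.SSLContext):

import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False  # only for self-signed test servers;
ctx.verify_mode = ssl.CERT_NONE  # this disables verification entirely
# mailbox = MailBox(server, port, ssl_context=ctx)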
@@ -25,7 +25,6 @@ class MailAccountSerializer(OwnedObjectSerializer):

    class Meta:
        model = MailAccount
        depth = 1
        fields = [
            "id",
            "name",

@@ -36,6 +35,10 @@ class MailAccountSerializer(OwnedObjectSerializer):
            "password",
            "character_set",
            "is_token",
            "owner",
            "user_can_change",
            "permissions",
            "set_permissions",
        ]

    def update(self, instance, validated_data):

@@ -67,7 +70,6 @@ class MailRuleSerializer(OwnedObjectSerializer):

    class Meta:
        model = MailRule
        depth = 1
        fields = [
            "id",
            "name",

@@ -89,6 +91,10 @@ class MailRuleSerializer(OwnedObjectSerializer):
            "order",
            "attachment_type",
            "consumption_scope",
            "owner",
            "user_can_change",
            "permissions",
            "set_permissions",
        ]

    def update(self, instance, validated_data):
@@ -1,7 +1,9 @@
import json
from unittest import mock

from django.contrib.auth.models import Permission
from django.contrib.auth.models import User
from guardian.shortcuts import assign_perm
from rest_framework import status
from rest_framework.test import APITestCase

@@ -27,7 +29,9 @@ class TestAPIMailAccounts(DirectoriesMixin, APITestCase):

        super().setUp()

        self.user = User.objects.create_superuser(username="temp_admin")
        self.user = User.objects.create_user(username="temp_admin")
        self.user.user_permissions.add(*Permission.objects.all())
        self.user.save()
        self.client.force_authenticate(user=self.user)

    def test_get_mail_accounts(self):

@@ -266,6 +270,73 @@ class TestAPIMailAccounts(DirectoriesMixin, APITestCase):
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["success"], True)

    def test_get_mail_accounts_owner_aware(self):
        """
        GIVEN:
            - Configured accounts with different users
        WHEN:
            - API call is made to get mail accounts
        THEN:
            - Only unowned, owned by user or granted accounts are provided
        """

        user2 = User.objects.create_user(username="temp_admin2")

        account1 = MailAccount.objects.create(
            name="Email1",
            username="username1",
            password="password1",
            imap_server="server.example.com",
            imap_port=443,
            imap_security=MailAccount.ImapSecurity.SSL,
            character_set="UTF-8",
        )

        account2 = MailAccount.objects.create(
            name="Email2",
            username="username2",
            password="password2",
            imap_server="server.example.com",
            imap_port=443,
            imap_security=MailAccount.ImapSecurity.SSL,
            character_set="UTF-8",
        )
        account2.owner = self.user
        account2.save()

        account3 = MailAccount.objects.create(
            name="Email3",
            username="username3",
            password="password3",
            imap_server="server.example.com",
            imap_port=443,
            imap_security=MailAccount.ImapSecurity.SSL,
            character_set="UTF-8",
        )
        account3.owner = user2
        account3.save()

        account4 = MailAccount.objects.create(
            name="Email4",
            username="username4",
            password="password4",
            imap_server="server.example.com",
            imap_port=443,
            imap_security=MailAccount.ImapSecurity.SSL,
            character_set="UTF-8",
        )
        account4.owner = user2
        account4.save()
        assign_perm("view_mailaccount", self.user, account4)

        response = self.client.get(self.ENDPOINT)

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 3)
        self.assertEqual(response.data["results"][0]["name"], account1.name)
        self.assertEqual(response.data["results"][1]["name"], account2.name)
        self.assertEqual(response.data["results"][2]["name"], account4.name)


class TestAPIMailRules(DirectoriesMixin, APITestCase):
    ENDPOINT = "/api/mail_rules/"

@@ -273,7 +344,9 @@ class TestAPIMailRules(DirectoriesMixin, APITestCase):
    def setUp(self):
        super().setUp()

        self.user = User.objects.create_superuser(username="temp_admin")
        self.user = User.objects.create_user(username="temp_admin")
        self.user.user_permissions.add(*Permission.objects.all())
        self.user.save()
        self.client.force_authenticate(user=self.user)

    def test_get_mail_rules(self):

@@ -533,3 +606,72 @@ class TestAPIMailRules(DirectoriesMixin, APITestCase):
        returned_rule1 = MailRule.objects.get(pk=rule1.pk)
        self.assertEqual(returned_rule1.name, "Updated Name 1")
        self.assertEqual(returned_rule1.action, MailRule.MailAction.DELETE)

    def test_get_mail_rules_owner_aware(self):
        """
        GIVEN:
            - Configured rules with different users
        WHEN:
            - API call is made to get mail rules
        THEN:
            - Only unowned, owned by user or granted mail rules are provided
        """

        user2 = User.objects.create_user(username="temp_admin2")

        account1 = MailAccount.objects.create(
            name="Email1",
            username="username1",
            password="password1",
            imap_server="server.example.com",
            imap_port=443,
            imap_security=MailAccount.ImapSecurity.SSL,
            character_set="UTF-8",
        )

        rule1 = MailRule.objects.create(
            name="Rule1",
            account=account1,
            folder="INBOX",
            filter_from="from@example1.com",
            order=0,
        )

        rule2 = MailRule.objects.create(
            name="Rule2",
            account=account1,
            folder="INBOX",
            filter_from="from@example2.com",
            order=1,
        )
        rule2.owner = self.user
        rule2.save()

        rule3 = MailRule.objects.create(
            name="Rule3",
            account=account1,
            folder="INBOX",
            filter_from="from@example3.com",
            order=2,
        )
        rule3.owner = user2
        rule3.save()

        rule4 = MailRule.objects.create(
            name="Rule4",
            account=account1,
            folder="INBOX",
            filter_from="from@example4.com",
            order=3,
        )
        rule4.owner = user2
        rule4.save()
        assign_perm("view_mailrule", self.user, rule4)

        response = self.client.get(self.ENDPOINT)

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.data["count"], 3)
        self.assertEqual(response.data["results"][0]["name"], rule1.name)
        self.assertEqual(response.data["results"][1]["name"], rule2.name)
        self.assertEqual(response.data["results"][2]["name"], rule4.name)
@@ -7,6 +7,8 @@ from rest_framework.permissions import IsAuthenticated
from rest_framework.response import Response
from rest_framework.viewsets import ModelViewSet

from documents.filters import ObjectOwnedOrGrantedPermissionsFilter
from documents.permissions import PaperlessObjectPermissions
from documents.views import PassUserMixin
from paperless.views import StandardPagination
from paperless_mail.mail import MailError

@@ -24,7 +26,8 @@ class MailAccountViewSet(ModelViewSet, PassUserMixin):
    queryset = MailAccount.objects.all().order_by("pk")
    serializer_class = MailAccountSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
    filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)


class MailRuleViewSet(ModelViewSet, PassUserMixin):

@@ -33,7 +36,8 @@ class MailRuleViewSet(ModelViewSet, PassUserMixin):
    queryset = MailRule.objects.all().order_by("order")
    serializer_class = MailRuleSerializer
    pagination_class = StandardPagination
    permission_classes = (IsAuthenticated,)
    permission_classes = (IsAuthenticated, PaperlessObjectPermissions)
    filter_backends = (ObjectOwnedOrGrantedPermissionsFilter,)


class MailAccountTestView(GenericAPIView):
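ObjectOwnedOrGrantedPermissionsFilter comes from documents.filters and is not shown in this diff. Conceptually it narrows the queryset to what the owner-aware tests above assert: objects that are unowned, owned by the requester, or granted to them via django-guardian. A hypothetical stand-in, not the actual implementation:

from django.db.models import Q
from guardian.shortcuts import get_objects_for_user
from rest_framework.filters import BaseFilterBackend


class OwnedOrGrantedFilter(BaseFilterBackend):
    """Hypothetical sketch of an owner-aware filter backend."""

    def filter_queryset(self, request, queryset, view):
        model_name = queryset.model._meta.model_name
        # Objects explicitly granted to the user via guardian object perms
        granted = get_objects_for_user(
            request.user,
            f"view_{model_name}",
            klass=queryset.model,
        )
        return queryset.filter(
            Q(owner__isnull=True) | Q(owner=request.user) | Q(pk__in=granted),
        )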
@@ -861,8 +861,9 @@ class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
        self.assertIsFile(parser.archive_path)
        # OCR consistent mangles this space, oh well
        self.assertIn(
            "this is awebp document, created 11/14/2022.",
        # Older tesseracts consistently mangle the space between "a webp",
        # tesseract 5.3.0 seems to do a better job, so we're accepting both
        self.assertRegex(
            parser.get_text().lower(),
            r"this is a ?webp document, created 11/14/2022.",
        )