Merge remote-tracking branch 'paperless/dev' into feature-consume-eml

This commit is contained in:
phail
2022-10-23 20:37:22 +02:00
225 changed files with 19278 additions and 25141 deletions

View File

@@ -42,6 +42,7 @@ class DocumentAdmin(admin.ModelAdmin):
"checksum",
"archive_filename",
"archive_checksum",
"original_filename",
)
list_display_links = ("title",)

View File

@@ -3,12 +3,16 @@ import os
import shutil
import tempfile
from functools import lru_cache
from typing import List # for type hinting. Can be removed, if only Python >3.8 is used
from typing import List
from typing import Optional
from typing import Tuple
import magic
from django.conf import settings
from pdf2image import convert_from_path
from pikepdf import Page
from pikepdf import Pdf
from pikepdf import PdfImage
from PIL import Image
from PIL import ImageSequence
from pyzbar import pyzbar
@@ -16,6 +20,10 @@ from pyzbar import pyzbar
logger = logging.getLogger("paperless.barcodes")
class BarcodeImageFormatError(Exception):
pass
@lru_cache(maxsize=8)
def supported_file_type(mime_type) -> bool:
"""
@@ -31,7 +39,7 @@ def supported_file_type(mime_type) -> bool:
return mime_type in supported_mime
def barcode_reader(image) -> List[str]:
def barcode_reader(image: Image) -> List[str]:
"""
Read any barcodes contained in image
Returns a list containing all found barcodes
@@ -98,21 +106,66 @@ def convert_from_tiff_to_pdf(filepath: str) -> str:
return newpath
def scan_file_for_separating_barcodes(filepath: str) -> List[int]:
def scan_file_for_separating_barcodes(filepath: str) -> Tuple[Optional[str], List[int]]:
"""
Scan the provided pdf file for page separating barcodes
Returns a list of pagenumbers, which separate the file
Returns a PDF filepath and a list of page numbers,
which separate the file into new files
"""
def _pikepdf_barcode_scan(pdf_filepath: str):
with Pdf.open(pdf_filepath) as pdf:
for page_num, page in enumerate(pdf.pages):
for image_key in page.images:
pdfimage = PdfImage(page.images[image_key])
if "/CCITTFaxDecode" in pdfimage.filters:
raise BarcodeImageFormatError()
# Not all images can be transcoded to a PIL image, which
# is what pyzbar expects to receive
pillow_img = pdfimage.as_pil_image()
detected_barcodes = barcode_reader(pillow_img)
if settings.CONSUMER_BARCODE_STRING in detected_barcodes:
separator_page_numbers.append(page_num)
def _pdf2image_barcode_scan(pdf_filepath: str):
# use a temporary directory in case the file is too big to handle in memory
with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(pdf_filepath, output_folder=path)
for current_page_number, page in enumerate(pages_from_path):
current_barcodes = barcode_reader(page)
if settings.CONSUMER_BARCODE_STRING in current_barcodes:
separator_page_numbers.append(current_page_number)
separator_page_numbers = []
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
# use a temporary directory in case the file is too big to handle in memory
with tempfile.TemporaryDirectory() as path:
pages_from_path = convert_from_path(filepath, output_folder=path)
for current_page_number, page in enumerate(pages_from_path):
current_barcodes = barcode_reader(page)
if separator_barcode in current_barcodes:
separator_page_numbers.append(current_page_number)
return separator_page_numbers
pdf_filepath = None
mime_type = get_file_mime_type(filepath)
if supported_file_type(mime_type):
pdf_filepath = filepath
if mime_type == "image/tiff":
pdf_filepath = convert_from_tiff_to_pdf(filepath)
try:
_pikepdf_barcode_scan(pdf_filepath)
except Exception as e:
logger.warning(
f"Exception using pikepdf for barcodes, falling back to pdf2image: {e}",
)
# Reset this in case pikepdf got partway through
separator_page_numbers = []
_pdf2image_barcode_scan(pdf_filepath)
else:
logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}",
)
return pdf_filepath, separator_page_numbers
def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
@@ -122,47 +175,56 @@ def separate_pages(filepath: str, pages_to_split_on: List[int]) -> List[str]:
Returns a list of (temporary) filepaths to consume.
These will need to be deleted later.
"""
document_paths = []
if not pages_to_split_on:
logger.warning("No pages to split on!")
return document_paths
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
fname = os.path.splitext(os.path.basename(filepath))[0]
pdf = Pdf.open(filepath)
document_paths = []
logger.debug(f"Temp dir is {str(tempdir)}")
if not pages_to_split_on:
logger.warning("No pages to split on!")
else:
# go from the first page to the first separator page
# A list of documents, ie a list of lists of pages
documents: List[List[Page]] = []
# A single document, ie a list of pages
document: List[Page] = []
for idx, page in enumerate(pdf.pages):
# Keep building the new PDF as long as it is not a
# separator index
if idx not in pages_to_split_on:
document.append(page)
# Make sure to append the very last document to the documents
if idx == (len(pdf.pages) - 1):
documents.append(document)
document = []
else:
# This is a split index, save the current PDF pages, and restart
# a new destination page listing
logger.debug(f"Starting new document at idx {idx}")
documents.append(document)
document = []
documents = [x for x in documents if len(x)]
logger.debug(f"Split into {len(documents)} new documents")
# Write the new documents out
for doc_idx, document in enumerate(documents):
dst = Pdf.new()
for n, page in enumerate(pdf.pages):
if n < pages_to_split_on[0]:
dst.pages.append(page)
output_filename = f"{fname}_document_0.pdf"
dst.pages.extend(document)
output_filename = f"{fname}_document_{doc_idx}.pdf"
logger.debug(f"pdf no:{doc_idx} has {len(dst.pages)} pages")
savepath = os.path.join(tempdir, output_filename)
with open(savepath, "wb") as out:
dst.save(out)
document_paths = [savepath]
document_paths.append(savepath)
# iterate through the rest of the document
for count, page_number in enumerate(pages_to_split_on):
logger.debug(f"Count: {str(count)} page_number: {str(page_number)}")
dst = Pdf.new()
try:
next_page = pages_to_split_on[count + 1]
except IndexError:
next_page = len(pdf.pages)
# skip the first page_number. This contains the barcode page
for page in range(page_number + 1, next_page):
logger.debug(
f"page_number: {str(page_number)} next_page: {str(next_page)}",
)
dst.pages.append(pdf.pages[page])
output_filename = f"{fname}_document_{str(count + 1)}.pdf"
logger.debug(f"pdf no:{str(count)} has {str(len(dst.pages))} pages")
savepath = os.path.join(tempdir, output_filename)
with open(savepath, "wb") as out:
dst.save(out)
document_paths.append(savepath)
logger.debug(f"Temp files are {str(document_paths)}")
return document_paths
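
The refactored scan_file_for_separating_barcodes above now owns the mime-type check and the TIFF-to-PDF conversion, returning both the working PDF path and the separator pages. A minimal sketch of the new calling pattern (the input path is hypothetical and a configured paperless environment is assumed):

# Minimal sketch of the new calling pattern; "/tmp/scan.tiff" is hypothetical.
from documents import barcodes

pdf_filepath, separators = barcodes.scan_file_for_separating_barcodes("/tmp/scan.tiff")
if pdf_filepath is None:
    print("Unsupported file format")  # mime type was rejected
elif separators:
    # separate_pages() writes one temporary PDF per split document
    for new_doc in barcodes.separate_pages(pdf_filepath, separators):
        print(f"New document to consume: {new_doc}")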

View File

@@ -1,11 +1,12 @@
import itertools
from django.db.models import Q
from django_q.tasks import async_task
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.tasks import bulk_update_documents
from documents.tasks import update_document_archive_file
def set_correspondent(doc_ids, correspondent):
@@ -16,7 +17,7 @@ def set_correspondent(doc_ids, correspondent):
affected_docs = [doc.id for doc in qs]
qs.update(correspondent=correspondent)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
@@ -31,8 +32,7 @@ def set_storage_path(doc_ids, storage_path):
affected_docs = [doc.id for doc in qs]
qs.update(storage_path=storage_path)
async_task(
"documents.tasks.bulk_update_documents",
bulk_update_documents.delay(
document_ids=affected_docs,
)
@@ -47,7 +47,7 @@ def set_document_type(doc_ids, document_type):
affected_docs = [doc.id for doc in qs]
qs.update(document_type=document_type)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
@@ -63,7 +63,7 @@ def add_tag(doc_ids, tag):
[DocumentTagRelationship(document_id=doc, tag_id=tag) for doc in affected_docs],
)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
@@ -79,7 +79,7 @@ def remove_tag(doc_ids, tag):
Q(document_id__in=affected_docs) & Q(tag_id=tag),
).delete()
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
@@ -103,7 +103,7 @@ def modify_tags(doc_ids, add_tags, remove_tags):
ignore_conflicts=True,
)
async_task("documents.tasks.bulk_update_documents", document_ids=affected_docs)
bulk_update_documents.delay(document_ids=affected_docs)
return "OK"
@@ -122,6 +122,9 @@ def delete(doc_ids):
def redo_ocr(doc_ids):
async_task("documents.tasks.redo_ocr", document_ids=doc_ids)
for document_id in doc_ids:
update_document_archive_file.delay(
document_id=document_id,
)
return "OK"

View File

@@ -5,12 +5,15 @@ import pickle
import re
import shutil
import warnings
from typing import List
from typing import Optional
from django.conf import settings
from documents.models import Document
from documents.models import MatchingModel
logger = logging.getLogger("paperless.classifier")
class IncompatibleClassifierVersionError(Exception):
pass
@@ -20,15 +23,6 @@ class ClassifierModelCorruptError(Exception):
pass
logger = logging.getLogger("paperless.classifier")
def preprocess_content(content: str) -> str:
content = content.lower().strip()
content = re.sub(r"\s+", " ", content)
return content
def load_classifier() -> Optional["DocumentClassifier"]:
if not os.path.isfile(settings.MODEL_FILE):
logger.debug(
@@ -81,6 +75,9 @@ class DocumentClassifier:
self.document_type_classifier = None
self.storage_path_classifier = None
self._stemmer = None
self._stop_words = None
def load(self):
# Catch warnings for processing
with warnings.catch_warnings(record=True) as w:
@@ -101,8 +98,8 @@ class DocumentClassifier:
self.correspondent_classifier = pickle.load(f)
self.document_type_classifier = pickle.load(f)
self.storage_path_classifier = pickle.load(f)
except Exception:
raise ClassifierModelCorruptError()
except Exception as err:
raise ClassifierModelCorruptError() from err
# Check for the warning about unpickling from differing versions
# and consider it incompatible
@@ -139,11 +136,11 @@ class DocumentClassifier:
def train(self):
data = list()
labels_tags = list()
labels_correspondent = list()
labels_document_type = list()
labels_storage_path = list()
data = []
labels_tags = []
labels_correspondent = []
labels_document_type = []
labels_storage_path = []
# Step 1: Extract and preprocess training data from the database.
logger.debug("Gathering data from database...")
@@ -151,7 +148,7 @@ class DocumentClassifier:
for doc in Document.objects.order_by("pk").exclude(
tags__is_inbox_tag=True,
):
preprocessed_content = preprocess_content(doc.content)
preprocessed_content = self.preprocess_content(doc.content)
m.update(preprocessed_content.encode("utf-8"))
data.append(preprocessed_content)
@@ -231,6 +228,11 @@ class DocumentClassifier:
)
data_vectorized = self.data_vectorizer.fit_transform(data)
# See the notes here:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html # noqa: 501
# This attribute isn't needed to function and can be large
self.data_vectorizer.stop_words_ = None
# Step 3: train the classifiers
if num_tags > 0:
logger.debug("Training tags classifier...")
@@ -296,9 +298,52 @@ class DocumentClassifier:
return True
def preprocess_content(self, content: str) -> str:
"""
Processes the contents of a document, distilling it down into
words which are meaningful to the content
"""
# Lower case the document
content = content.lower().strip()
# Reduce spaces
content = re.sub(r"\s+", " ", content)
# Get only the letters
content = re.sub(r"[^\w\s]", " ", content)
# If the NLTK language is supported, do further processing
if settings.NLTK_LANGUAGE is not None and settings.NLTK_ENABLED:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
# Not really hacky, since it isn't private and is documented, but
# set the search path for NLTK data to the single location it should be in
nltk.data.path = [settings.NLTK_DIR]
# Do some one time setup
if self._stemmer is None:
self._stemmer = SnowballStemmer(settings.NLTK_LANGUAGE)
if self._stop_words is None:
self._stop_words = set(stopwords.words(settings.NLTK_LANGUAGE))
# Tokenize
words: List[str] = word_tokenize(content, language=settings.NLTK_LANGUAGE)
# Remove stop words
meaningful_words = [w for w in words if w not in self._stop_words]
# Stem words
meaningful_words = [self._stemmer.stem(w) for w in meaningful_words]
return " ".join(meaningful_words)
return content
def predict_correspondent(self, content):
if self.correspondent_classifier:
X = self.data_vectorizer.transform([preprocess_content(content)])
X = self.data_vectorizer.transform([self.preprocess_content(content)])
correspondent_id = self.correspondent_classifier.predict(X)
if correspondent_id != -1:
return correspondent_id
@@ -309,7 +354,7 @@ class DocumentClassifier:
def predict_document_type(self, content):
if self.document_type_classifier:
X = self.data_vectorizer.transform([preprocess_content(content)])
X = self.data_vectorizer.transform([self.preprocess_content(content)])
document_type_id = self.document_type_classifier.predict(X)
if document_type_id != -1:
return document_type_id
@@ -322,7 +367,7 @@ class DocumentClassifier:
from sklearn.utils.multiclass import type_of_target
if self.tags_classifier:
X = self.data_vectorizer.transform([preprocess_content(content)])
X = self.data_vectorizer.transform([self.preprocess_content(content)])
y = self.tags_classifier.predict(X)
tags_ids = self.tags_binarizer.inverse_transform(y)[0]
if type_of_target(y).startswith("multilabel"):
@@ -341,7 +386,7 @@ class DocumentClassifier:
def predict_storage_path(self, content):
if self.storage_path_classifier:
X = self.data_vectorizer.transform([preprocess_content(content)])
X = self.data_vectorizer.transform([self.preprocess_content(content)])
storage_path_id = self.storage_path_classifier.predict(X)
if storage_path_id != -1:
return storage_path_id
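
For reference, a standalone sketch of the NLTK pipeline that preprocess_content now applies when NLTK is enabled; the "english" language and the nltk.download() calls are illustrative assumptions, not paperless settings:

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# One-time corpus downloads, assumed here for a self-contained example
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)

def preprocess(content: str, language: str = "english") -> str:
    # Lower case, collapse whitespace, keep only word characters
    content = re.sub(r"\s+", " ", content.lower().strip())
    content = re.sub(r"[^\w\s]", " ", content)
    stemmer = SnowballStemmer(language)
    stop_words = set(stopwords.words(language))
    words = word_tokenize(content, language=language)
    return " ".join(stemmer.stem(w) for w in words if w not in stop_words)

print(preprocess("The quick brown foxes were jumping!"))  # e.g. "quick brown fox jump"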

View File

@@ -78,10 +78,16 @@ class Consumer(LoggingMixin):
{"type": "status_update", "data": payload},
)
def _fail(self, message, log_message=None, exc_info=None):
def _fail(
self,
message,
log_message=None,
exc_info=None,
exception: Optional[Exception] = None,
):
self._send_progress(100, 100, "FAILED", message)
self.log("error", log_message or message, exc_info=exc_info)
raise ConsumerError(f"{self.filename}: {log_message or message}")
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
def __init__(self):
super().__init__()
@@ -105,14 +111,16 @@ class Consumer(LoggingMixin):
def pre_check_duplicate(self):
with open(self.path, "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
if Document.objects.filter(
existing_doc = Document.objects.filter(
Q(checksum=checksum) | Q(archive_checksum=checksum),
).exists():
)
if existing_doc.exists():
if settings.CONSUMER_DELETE_DUPLICATES:
os.unlink(self.path)
self._fail(
MESSAGE_DOCUMENT_ALREADY_EXISTS,
f"Not consuming {self.filename}: It is a duplicate.",
f"Not consuming {self.filename}: It is a duplicate of"
f" {existing_doc.get().title} (#{existing_doc.get().pk})",
)
def pre_check_directories(self):
@@ -134,13 +142,25 @@ class Consumer(LoggingMixin):
self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
filepath_arg = os.path.normpath(self.path)
script_env = os.environ.copy()
script_env["DOCUMENT_SOURCE_PATH"] = filepath_arg
try:
Popen((settings.PRE_CONSUME_SCRIPT, self.path)).wait()
Popen(
(
settings.PRE_CONSUME_SCRIPT,
filepath_arg,
),
env=script_env,
).wait()
except Exception as e:
self._fail(
MESSAGE_PRE_CONSUME_SCRIPT_ERROR,
f"Error while executing pre-consume script: {e}",
exc_info=True,
exception=e,
)
def run_post_consume_script(self, document):
@@ -159,6 +179,34 @@ class Consumer(LoggingMixin):
f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}",
)
script_env = os.environ.copy()
script_env["DOCUMENT_ID"] = str(document.pk)
script_env["DOCUMENT_CREATED"] = str(document.created)
script_env["DOCUMENT_MODIFIED"] = str(document.modified)
script_env["DOCUMENT_ADDED"] = str(document.added)
script_env["DOCUMENT_FILE_NAME"] = document.get_public_filename()
script_env["DOCUMENT_SOURCE_PATH"] = os.path.normpath(document.source_path)
script_env["DOCUMENT_ARCHIVE_PATH"] = os.path.normpath(
str(document.archive_path),
)
script_env["DOCUMENT_THUMBNAIL_PATH"] = os.path.normpath(
document.thumbnail_path,
)
script_env["DOCUMENT_DOWNLOAD_URL"] = reverse(
"document-download",
kwargs={"pk": document.pk},
)
script_env["DOCUMENT_THUMBNAIL_URL"] = reverse(
"document-thumb",
kwargs={"pk": document.pk},
)
script_env["DOCUMENT_CORRESPONDENT"] = str(document.correspondent)
script_env["DOCUMENT_TAGS"] = str(
",".join(document.tags.all().values_list("name", flat=True)),
)
script_env["DOCUMENT_ORIGINAL_FILENAME"] = str(document.original_filename)
try:
Popen(
(
@@ -172,12 +220,14 @@ class Consumer(LoggingMixin):
str(document.correspondent),
str(",".join(document.tags.all().values_list("name", flat=True))),
),
env=script_env,
).wait()
except Exception as e:
self._fail(
MESSAGE_POST_CONSUME_SCRIPT_ERROR,
f"Error while executing post-consume script: {e}",
exc_info=True,
exception=e,
)
def try_consume_file(
@@ -292,6 +342,7 @@ class Consumer(LoggingMixin):
str(e),
f"Error while consuming document {self.filename}: {e}",
exc_info=True,
exception=e,
)
# Prepare the document classifier.
@@ -376,6 +427,7 @@ class Consumer(LoggingMixin):
f"The following error occurred while consuming "
f"{self.filename}: {e}",
exc_info=True,
exception=e,
)
finally:
document_parser.cleanup()
@@ -426,6 +478,7 @@ class Consumer(LoggingMixin):
created=create_date,
modified=create_date,
storage_type=storage_type,
original_filename=self.filename,
)
self.apply_overrides(document)
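
Pre- and post-consume scripts now receive their document context through environment variables in addition to positional arguments. A sketch of a post-consume script reading them (the script itself is hypothetical; the variable names are the ones exported above):

#!/usr/bin/env python3
# Hypothetical post-consume script using the environment variables above.
import os

doc_id = os.environ["DOCUMENT_ID"]
source_path = os.environ["DOCUMENT_SOURCE_PATH"]
original_name = os.environ.get("DOCUMENT_ORIGINAL_FILENAME", "")
print(f"Consumed document {doc_id} from {source_path} (original: {original_name})")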

View File

@@ -1,85 +1,18 @@
import hashlib
import logging
import multiprocessing
import os
import shutil
import uuid
import tqdm
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand
from django.db import transaction
from documents.models import Document
from filelock import FileLock
from ... import index
from ...file_handling import create_source_path_directory
from ...file_handling import generate_unique_filename
from ...parsers import get_parser_class_for_mime_type
from documents.tasks import update_document_archive_file
logger = logging.getLogger("paperless.management.archiver")
def handle_document(document_id):
document = Document.objects.get(id=document_id)
mime_type = document.mime_type
parser_class = get_parser_class_for_mime_type(mime_type)
if not parser_class:
logger.error(
f"No parser found for mime type {mime_type}, cannot "
f"archive document {document} (ID: {document_id})",
)
return
parser = parser_class(logging_group=uuid.uuid4())
try:
parser.parse(document.source_path, mime_type, document.get_public_filename())
thumbnail = parser.get_thumbnail(
document.source_path,
mime_type,
document.get_public_filename(),
)
if parser.get_archive_path():
with transaction.atomic():
with open(parser.get_archive_path(), "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# We also don't use save() since that triggers the filehandling
# logic, and we don't want that yet (file not yet in place)
document.archive_filename = generate_unique_filename(
document,
archive_filename=True,
)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text(),
archive_filename=document.archive_filename,
)
with FileLock(settings.MEDIA_LOCK):
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(), document.archive_path)
shutil.move(thumbnail, document.thumbnail_path)
with index.open_index_writer() as writer:
index.update_document(writer, document)
except Exception:
logger.exception(
f"Error while parsing document {document} " f"(ID: {document_id})",
)
finally:
parser.cleanup()
class Command(BaseCommand):
help = """
@@ -146,7 +79,7 @@ class Command(BaseCommand):
with multiprocessing.Pool(processes=settings.TASK_WORKERS) as pool:
list(
tqdm.tqdm(
pool.imap_unordered(handle_document, document_ids),
pool.imap_unordered(update_document_archive_file, document_ids),
total=len(document_ids),
disable=options["no_progress_bar"],
),

View File

@@ -2,6 +2,7 @@ import logging
import os
from pathlib import Path
from pathlib import PurePath
from threading import Event
from threading import Thread
from time import monotonic
from time import sleep
@@ -10,9 +11,9 @@ from typing import Final
from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django_q.tasks import async_task
from documents.models import Tag
from documents.parsers import is_file_ext_supported
from documents.tasks import consume_file
from watchdog.events import FileSystemEventHandler
from watchdog.observers.polling import PollingObserver
@@ -91,11 +92,9 @@ def _consume(filepath):
try:
logger.info(f"Adding {filepath} to the task queue.")
async_task(
"documents.tasks.consume_file",
consume_file.delay(
filepath,
override_tag_ids=tag_ids if tag_ids else None,
task_name=os.path.basename(filepath)[:100],
)
except Exception:
# Catch all so that the consumer won't crash.
@@ -148,9 +147,11 @@ class Command(BaseCommand):
"""
# This is here primarily for the tests and is irrelevant in production.
stop_flag = False
observer = None
stop_flag = Event()
# Also only for testing, configures in one place the timeout used before checking
# the stop flag
testing_timeout_s: Final[float] = 0.5
testing_timeout_ms: Final[float] = testing_timeout_s * 1000.0
def add_arguments(self, parser):
parser.add_argument(
@@ -161,6 +162,16 @@ class Command(BaseCommand):
)
parser.add_argument("--oneshot", action="store_true", help="Run only once.")
# Only use during unit testing, will configure a timeout
# Leaving it unset or false and the consumer will exit when it
# receives SIGINT
parser.add_argument(
"--testing",
action="store_true",
help="Flag used only for unit testing",
default=False,
)
def handle(self, *args, **options):
directory = options["directory"]
recursive = settings.CONSUMER_RECURSIVE
@@ -186,29 +197,40 @@ class Command(BaseCommand):
return
if settings.CONSUMER_POLLING == 0 and INotify:
self.handle_inotify(directory, recursive)
self.handle_inotify(directory, recursive, options["testing"])
else:
self.handle_polling(directory, recursive)
self.handle_polling(directory, recursive, options["testing"])
logger.debug("Consumer exiting.")
def handle_polling(self, directory, recursive):
def handle_polling(self, directory, recursive, is_testing: bool):
logger.info(f"Polling directory for changes: {directory}")
self.observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
self.observer.schedule(Handler(), directory, recursive=recursive)
self.observer.start()
try:
while self.observer.is_alive():
self.observer.join(1)
if self.stop_flag:
self.observer.stop()
except KeyboardInterrupt:
self.observer.stop()
self.observer.join()
def handle_inotify(self, directory, recursive):
timeout = None
if is_testing:
timeout = self.testing_timeout_s
logger.debug(f"Configuring timeout to {timeout}s")
observer = PollingObserver(timeout=settings.CONSUMER_POLLING)
observer.schedule(Handler(), directory, recursive=recursive)
observer.start()
try:
while observer.is_alive():
observer.join(timeout)
if self.stop_flag.is_set():
observer.stop()
except KeyboardInterrupt:
observer.stop()
observer.join()
def handle_inotify(self, directory, recursive, is_testing: bool):
logger.info(f"Using inotify to watch directory for changes: {directory}")
timeout = None
if is_testing:
timeout = self.testing_timeout_ms
logger.debug(f"Configuring timeout to {timeout}ms")
inotify = INotify()
inotify_flags = flags.CLOSE_WRITE | flags.MOVED_TO
if recursive:
@@ -216,14 +238,15 @@ class Command(BaseCommand):
else:
descriptor = inotify.add_watch(directory, inotify_flags)
try:
inotify_debounce: Final[float] = settings.CONSUMER_INOTIFY_DELAY
notified_files = {}
finished = False
while not self.stop_flag:
notified_files = {}
for event in inotify.read(timeout=1000):
while not finished:
try:
for event in inotify.read(timeout=timeout):
if recursive:
path = inotify.get_path(event.wd)
else:
@@ -256,8 +279,22 @@ class Command(BaseCommand):
# These files are still waiting to hit the timeout
notified_files = still_waiting
except KeyboardInterrupt:
pass
# If files are waiting, need to exit read() to check them
# Otherwise, go back to infinite sleep time, but only if not testing
if len(notified_files) > 0:
timeout = inotify_debounce
elif is_testing:
timeout = self.testing_timeout_ms
else:
timeout = None
if self.stop_flag.is_set():
logger.debug("Finishing because event is set")
finished = True
except KeyboardInterrupt:
logger.info("Received SIGINT, stopping inotify")
finished = True
inotify.rm_watch(descriptor)
inotify.close()
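
The stop flag here changes from a plain boolean to a threading.Event, and inotify.read() gains a variable timeout so the loop can wake up to check it. A minimal sketch of that pattern in isolation:

# Minimal sketch of the Event-based stop pattern; time.sleep() stands in
# for the blocking inotify.read(timeout=...) call.
import time
from threading import Event, Thread

stop_flag = Event()

def watch_loop(timeout_s: float = 0.5) -> None:
    while not stop_flag.is_set():
        time.sleep(timeout_s)  # block briefly, then re-check the flag
    print("Finishing because event is set")

t = Thread(target=watch_loop)
t.start()
stop_flag.set()  # e.g. from a test, instead of sending SIGINT
t.join()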

View File

@@ -12,11 +12,13 @@ from django.core import serializers
from django.core.management.base import BaseCommand
from django.core.management.base import CommandError
from django.db import transaction
from documents.models import Comment
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import SavedView
from documents.models import SavedViewFilterRule
from documents.models import StoragePath
from documents.models import Tag
from documents.models import UiSettings
from documents.settings import EXPORTER_ARCHIVE_NAME
@@ -113,8 +115,8 @@ class Command(BaseCommand):
map(lambda f: os.path.abspath(os.path.join(root, f)), files),
)
# 2. Create manifest, containing all correspondents, types, tags,
# documents and ui_settings
# 2. Create manifest, containing all correspondents, types, tags, storage paths
# comments, documents and ui_settings
with transaction.atomic():
manifest = json.loads(
serializers.serialize("json", Correspondent.objects.all()),
@@ -126,6 +128,14 @@ class Command(BaseCommand):
serializers.serialize("json", DocumentType.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", StoragePath.objects.all()),
)
manifest += json.loads(
serializers.serialize("json", Comment.objects.all()),
)
documents = Document.objects.order_by("id")
document_map = {d.pk: d for d in documents}
document_manifest = json.loads(serializers.serialize("json", documents))

View File

@@ -3,6 +3,7 @@ import logging
import os
import shutil
from contextlib import contextmanager
from pathlib import Path
import tqdm
from django.conf import settings
@@ -14,6 +15,7 @@ from django.core.serializers.base import DeserializationError
from django.db.models.signals import m2m_changed
from django.db.models.signals import post_save
from documents.models import Document
from documents.parsers import run_convert
from documents.settings import EXPORTER_ARCHIVE_NAME
from documents.settings import EXPORTER_FILE_NAME
from documents.settings import EXPORTER_THUMBNAIL_NAME
@@ -192,7 +194,7 @@ class Command(BaseCommand):
document_path = os.path.join(self.source, doc_file)
thumb_file = record[EXPORTER_THUMBNAIL_NAME]
thumbnail_path = os.path.join(self.source, thumb_file)
thumbnail_path = Path(os.path.join(self.source, thumb_file)).resolve()
if EXPORTER_ARCHIVE_NAME in record:
archive_file = record[EXPORTER_ARCHIVE_NAME]
@@ -209,7 +211,20 @@ class Command(BaseCommand):
create_source_path_directory(document.source_path)
shutil.copy2(document_path, document.source_path)
shutil.copy2(thumbnail_path, document.thumbnail_path)
if thumbnail_path.suffix in {".png", ".PNG"}:
run_convert(
density=300,
scale="500x5000>",
alpha="remove",
strip=True,
trim=False,
auto_orient=True,
input_file=f"{thumbnail_path}[0]",
output_file=str(document.thumbnail_path),
)
else:
shutil.copy2(thumbnail_path, document.thumbnail_path)
if archive_path:
create_source_path_directory(document.archive_path)
# TODO: this assumes that the export is valid and

View File

@@ -1,35 +0,0 @@
import tqdm
from django.core.management.base import BaseCommand
from documents.tasks import redo_ocr
class Command(BaseCommand):
help = """
This will rename all documents to match the latest filename format.
""".replace(
" ",
"",
)
def add_arguments(self, parser):
parser.add_argument(
"--no-progress-bar",
default=False,
action="store_true",
help="If set, the progress bar will not be shown",
)
parser.add_argument(
"documents",
nargs="+",
help="Document primary keys for re-processing OCR on",
)
def handle(self, *args, **options):
doc_pks = tqdm.tqdm(
options["documents"],
disable=options["no_progress_bar"],
)
redo_ocr(doc_pks)

View File

@@ -7,6 +7,7 @@ from documents.models import Document
from ...signals.handlers import set_correspondent
from ...signals.handlers import set_document_type
from ...signals.handlers import set_storage_path
from ...signals.handlers import set_tags
@@ -29,6 +30,7 @@ class Command(BaseCommand):
parser.add_argument("-c", "--correspondent", default=False, action="store_true")
parser.add_argument("-T", "--tags", default=False, action="store_true")
parser.add_argument("-t", "--document_type", default=False, action="store_true")
parser.add_argument("-s", "--storage_path", default=False, action="store_true")
parser.add_argument("-i", "--inbox-only", default=False, action="store_true")
parser.add_argument(
"--use-first",
@@ -112,3 +114,14 @@ class Command(BaseCommand):
base_url=options["base_url"],
color=color,
)
if options["storage_path"]:
set_storage_path(
sender=None,
document=document,
classifier=classifier,
replace=options["overwrite"],
use_first=options["use_first"],
suggest=options["suggest"],
base_url=options["base_url"],
color=color,
)

View File

@@ -1,34 +1,14 @@
# Generated by Django 3.1.3 on 2020-11-09 16:36
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule(
"documents.tasks.train_classifier",
name="Train the classifier",
schedule_type=Schedule.HOURLY,
)
schedule(
"documents.tasks.index_optimize",
name="Optimize the index",
schedule_type=Schedule.DAILY,
)
def remove_schedules(apps, schema_editor):
Schedule.objects.filter(func="documents.tasks.train_classifier").delete()
Schedule.objects.filter(func="documents.tasks.index_optimize").delete()
class Migration(migrations.Migration):
dependencies = [
("documents", "1000_update_paperless_all"),
("django_q", "0013_task_attempt_count"),
]
operations = [RunPython(add_schedules, remove_schedules)]
operations = [
migrations.RunPython(migrations.RunPython.noop, migrations.RunPython.noop)
]

View File

@@ -2,27 +2,12 @@
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule(
"documents.tasks.sanity_check",
name="Perform sanity check",
schedule_type=Schedule.WEEKLY,
)
def remove_schedules(apps, schema_editor):
Schedule.objects.filter(func="documents.tasks.sanity_check").delete()
class Migration(migrations.Migration):
dependencies = [
("documents", "1003_mime_types"),
("django_q", "0013_task_attempt_count"),
]
operations = [RunPython(add_schedules, remove_schedules)]
operations = [RunPython(migrations.RunPython.noop, migrations.RunPython.noop)]

View File

@@ -4,28 +4,9 @@ from django.db import migrations, models
import django.db.models.deletion
def init_paperless_tasks(apps, schema_editor):
PaperlessTask = apps.get_model("documents", "PaperlessTask")
Task = apps.get_model("django_q", "Task")
for task in Task.objects.filter(func="documents.tasks.consume_file"):
if not hasattr(task, "paperlesstask"):
paperlesstask = PaperlessTask.objects.create(
attempted_task=task,
task_id=task.id,
name=task.name,
created=task.started,
started=task.started,
acknowledged=True,
)
task.paperlesstask = paperlesstask
task.save()
class Migration(migrations.Migration):
dependencies = [
("django_q", "0014_schedule_cluster"),
("documents", "1021_webp_thumbnail_conversion"),
]
@@ -60,10 +41,12 @@ class Migration(migrations.Migration):
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="attempted_task",
to="django_q.task",
# This is a dummy field, 1026 will fix up the column
# This manual change is required, as django doesn't really support
# removing an app which has migration deps like this
to="documents.document",
),
),
],
),
migrations.RunPython(init_paperless_tasks, migrations.RunPython.noop),
)
]

View File

@@ -0,0 +1,69 @@
from django.db import migrations, models
import django.utils.timezone
from django.conf import settings
class Migration(migrations.Migration):
dependencies = [
("documents", "1022_paperlesstask"),
]
operations = [
migrations.CreateModel(
name="Comment",
fields=[
(
"id",
models.AutoField(
auto_created=True,
primary_key=True,
serialize=False,
verbose_name="ID",
),
),
(
"comment",
models.TextField(
blank=True,
help_text="Comment for the document",
verbose_name="content",
),
),
(
"created",
models.DateTimeField(
db_index=True,
default=django.utils.timezone.now,
verbose_name="created",
),
),
(
"document",
models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="documents",
to="documents.document",
verbose_name="document",
),
),
(
"user",
models.ForeignKey(
blank=True,
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="users",
to=settings.AUTH_USER_MODEL,
verbose_name="user",
),
),
],
options={
"verbose_name": "comment",
"verbose_name_plural": "comments",
"ordering": ("created",),
},
),
]

View File

@@ -0,0 +1,25 @@
# Generated by Django 4.0.6 on 2022-07-25 06:34
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("documents", "1023_add_comments"),
]
operations = [
migrations.AddField(
model_name="document",
name="original_filename",
field=models.CharField(
default=None,
editable=False,
help_text="The original name of the file when it was uploaded",
max_length=1024,
null=True,
verbose_name="original filename",
),
),
]

View File

@@ -0,0 +1,48 @@
# Generated by Django 4.0.5 on 2022-08-26 16:49
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("documents", "1024_document_original_filename"),
]
operations = [
migrations.AlterField(
model_name="savedviewfilterrule",
name="rule_type",
field=models.PositiveIntegerField(
choices=[
(0, "title contains"),
(1, "content contains"),
(2, "ASN is"),
(3, "correspondent is"),
(4, "document type is"),
(5, "is in inbox"),
(6, "has tag"),
(7, "has any tag"),
(8, "created before"),
(9, "created after"),
(10, "created year is"),
(11, "created month is"),
(12, "created day is"),
(13, "added before"),
(14, "added after"),
(15, "modified before"),
(16, "modified after"),
(17, "does not have tag"),
(18, "does not have ASN"),
(19, "title or content contains"),
(20, "fulltext query"),
(21, "more like this"),
(22, "has tags in"),
(23, "ASN greater than"),
(24, "ASN less than"),
(25, "storage path is"),
],
verbose_name="rule type",
),
),
]

View File

@@ -0,0 +1,57 @@
# Generated by Django 4.1.1 on 2022-09-27 19:31
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
("django_celery_results", "0011_taskresult_periodic_task_name"),
("documents", "1025_alter_savedviewfilterrule_rule_type"),
]
operations = [
migrations.RemoveField(
model_name="paperlesstask",
name="created",
),
migrations.RemoveField(
model_name="paperlesstask",
name="name",
),
migrations.RemoveField(
model_name="paperlesstask",
name="started",
),
# Remove the field from the model
migrations.RemoveField(
model_name="paperlesstask",
name="attempted_task",
),
# Add the field back, pointing to the correct model
# This resolves a problem where the temporary change in 1022
# results in a type mismatch
migrations.AddField(
model_name="paperlesstask",
name="attempted_task",
field=models.OneToOneField(
blank=True,
null=True,
on_delete=django.db.models.deletion.CASCADE,
related_name="attempted_task",
to="django_celery_results.taskresult",
),
),
# Drop the django-q tables entirely
# Must be done last or there could be references here
migrations.RunSQL(
"DROP TABLE IF EXISTS django_q_ormq", reverse_sql=migrations.RunSQL.noop
),
migrations.RunSQL(
"DROP TABLE IF EXISTS django_q_schedule", reverse_sql=migrations.RunSQL.noop
),
migrations.RunSQL(
"DROP TABLE IF EXISTS django_q_task", reverse_sql=migrations.RunSQL.noop
),
]

View File

@@ -12,7 +12,7 @@ from django.contrib.auth.models import User
from django.db import models
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
from django_q.tasks import Task
from django_celery_results.models import TaskResult
from documents.parsers import get_default_file_extension
@@ -214,6 +214,16 @@ class Document(models.Model):
help_text=_("Current archive filename in storage"),
)
original_filename = models.CharField(
_("original filename"),
max_length=1024,
editable=False,
default=None,
unique=False,
null=True,
help_text=_("The original name of the file when it was uploaded"),
)
archive_serial_number = models.IntegerField(
_("archive serial number"),
blank=True,
@@ -394,6 +404,9 @@ class SavedViewFilterRule(models.Model):
(20, _("fulltext query")),
(21, _("more like this")),
(22, _("has tags in")),
(23, _("ASN greater than")),
(24, _("ASN less than")),
(25, _("storage path is")),
]
saved_view = models.ForeignKey(
@@ -514,16 +527,53 @@ class UiSettings(models.Model):
class PaperlessTask(models.Model):
task_id = models.CharField(max_length=128)
name = models.CharField(max_length=256)
created = models.DateTimeField(_("created"), auto_now=True)
started = models.DateTimeField(_("started"), null=True)
acknowledged = models.BooleanField(default=False)
attempted_task = models.OneToOneField(
Task,
TaskResult,
on_delete=models.CASCADE,
related_name="attempted_task",
null=True,
blank=True,
)
acknowledged = models.BooleanField(default=False)
class Comment(models.Model):
comment = models.TextField(
_("content"),
blank=True,
help_text=_("Comment for the document"),
)
created = models.DateTimeField(
_("created"),
default=timezone.now,
db_index=True,
)
document = models.ForeignKey(
Document,
blank=True,
null=True,
related_name="documents",
on_delete=models.CASCADE,
verbose_name=_("document"),
)
user = models.ForeignKey(
User,
blank=True,
null=True,
related_name="users",
on_delete=models.SET_NULL,
verbose_name=_("user"),
)
class Meta:
ordering = ("created",)
verbose_name = _("comment")
verbose_name_plural = _("comments")
def __str__(self):
return self.content
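
A sketch of attaching a comment to a document with the new model (assumes at least one Document and User already exist in the database):

# Sketch using the new Comment model; existing Document and User assumed.
from django.contrib.auth.models import User

from documents.models import Comment, Document

Comment.objects.create(
    comment="Needs review",            # the text field defined above
    document=Document.objects.first(),
    user=User.objects.first(),
)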

View File

@@ -6,6 +6,8 @@ import re
import shutil
import subprocess
import tempfile
from typing import Iterator
from typing import Match
from typing import Optional
from typing import Set
@@ -216,6 +218,10 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
def parse_date(filename, text) -> Optional[datetime.datetime]:
return next(parse_date_generator(filename, text), None)
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
"""
Returns the date of the document.
"""
@@ -246,38 +252,32 @@ def parse_date(filename, text) -> Optional[datetime.datetime]:
return date
return None
date = None
def __process_match(
match: Match[str],
date_order: str,
) -> Optional[datetime.datetime]:
date_string = match.group(0)
try:
date = __parser(date_string, date_order)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
date = None
return __filter(date)
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
for m in re.finditer(DATE_REGEX, content):
date = __process_match(m, date_order)
if date is not None:
yield date
# if filename date parsing is enabled, search there first:
if settings.FILENAME_DATE_ORDER:
for m in re.finditer(DATE_REGEX, filename):
date_string = m.group(0)
try:
date = __parser(date_string, settings.FILENAME_DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
date = __filter(date)
if date is not None:
return date
yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
# Iterate through all regex matches in text and try to parse the date
for m in re.finditer(DATE_REGEX, text):
date_string = m.group(0)
try:
date = __parser(date_string, settings.DATE_ORDER)
except (TypeError, ValueError):
# Skip all matches that do not parse to a proper date
continue
date = __filter(date)
if date is not None:
return date
return date
yield from __process_content(text, settings.DATE_ORDER)
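
Turning date parsing into a generator lets callers pull more than one candidate date, while parse_date keeps its old behaviour by taking only the first. A sketch of consuming several candidates (the filename and text are hypothetical):

# Sketch of the generator form; inputs are hypothetical.
import itertools

from documents.parsers import parse_date_generator

candidates = itertools.islice(
    parse_date_generator("scan.pdf", "Invoiced 01.02.2021, due 15.03.2021"),
    5,
)
print(list(candidates))  # up to five parsed datetimes, in document order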
class ParseError(Exception):

View File

@@ -1,6 +1,14 @@
import datetime
import math
import re
from ast import literal_eval
from asyncio.log import logger
from pathlib import Path
from typing import Dict
from typing import Optional
from typing import Tuple
from celery import states
try:
import zoneinfo
@@ -18,12 +26,12 @@ from .models import Correspondent
from .models import Document
from .models import DocumentType
from .models import MatchingModel
from .models import PaperlessTask
from .models import SavedView
from .models import SavedViewFilterRule
from .models import StoragePath
from .models import Tag
from .models import UiSettings
from .models import PaperlessTask
from .parsers import is_mime_type_supported
@@ -240,7 +248,8 @@ class DocumentSerializer(DynamicFieldsModelSerializer):
)
instance.created = new_datetime
instance.save()
validated_data.pop("created_date")
if "created_date" in validated_data:
validated_data.pop("created_date")
super().update(instance, validated_data)
return instance
@@ -607,6 +616,15 @@ class UiSettingsViewSerializer(serializers.ModelSerializer):
"settings",
]
def validate_settings(self, settings):
# we never save the update-checking backend setting
if "update_checking" in settings:
try:
settings["update_checking"].pop("backend_setting")
except KeyError:
pass
return settings
def create(self, validated_data):
ui_settings = UiSettings.objects.update_or_create(
user=validated_data.get("user"),
@@ -619,7 +637,19 @@ class TasksViewSerializer(serializers.ModelSerializer):
class Meta:
model = PaperlessTask
depth = 1
fields = "__all__"
fields = (
"id",
"task_id",
"date_created",
"date_done",
"type",
"status",
"result",
"acknowledged",
"task_name",
"name",
"related_document",
)
type = serializers.SerializerMethodField()
@@ -631,24 +661,108 @@ class TasksViewSerializer(serializers.ModelSerializer):
def get_result(self, obj):
result = ""
if hasattr(obj, "attempted_task") and obj.attempted_task:
result = obj.attempted_task.result
if (
hasattr(obj, "attempted_task")
and obj.attempted_task
and obj.attempted_task.result
):
try:
result: str = obj.attempted_task.result
if "exc_message" in result:
# This is a dict in this case
result: Dict = literal_eval(result)
# This is a list, grab the first item (most recent)
result = result["exc_message"][0]
except Exception as e: # pragma: no cover
# Extra security if something is malformed
logger.warn(f"Error getting task result: {e}", exc_info=True)
return result
status = serializers.SerializerMethodField()
def get_status(self, obj):
if obj.attempted_task is None:
if obj.started:
return "started"
else:
return "queued"
elif obj.attempted_task.success:
return "complete"
elif not obj.attempted_task.success:
return "failed"
else:
return "unknown"
result = "unknown"
if hasattr(obj, "attempted_task") and obj.attempted_task:
result = obj.attempted_task.status
return result
date_created = serializers.SerializerMethodField()
def get_date_created(self, obj):
result = ""
if hasattr(obj, "attempted_task") and obj.attempted_task:
result = obj.attempted_task.date_created
return result
date_done = serializers.SerializerMethodField()
def get_date_done(self, obj):
result = ""
if hasattr(obj, "attempted_task") and obj.attempted_task:
result = obj.attempted_task.date_done
return result
task_id = serializers.SerializerMethodField()
def get_task_id(self, obj):
result = ""
if hasattr(obj, "attempted_task") and obj.attempted_task:
result = obj.attempted_task.task_id
return result
task_name = serializers.SerializerMethodField()
def get_task_name(self, obj):
result = ""
if hasattr(obj, "attempted_task") and obj.attempted_task:
result = obj.attempted_task.task_name
return result
name = serializers.SerializerMethodField()
def get_name(self, obj):
result = ""
if hasattr(obj, "attempted_task") and obj.attempted_task:
try:
task_kwargs: Optional[str] = obj.attempted_task.task_kwargs
# Try the override filename first (this is a webui created task?)
if task_kwargs is not None:
# It's a string, string of a dict. Who knows why...
kwargs = literal_eval(literal_eval(task_kwargs))
if "override_filename" in kwargs:
result = kwargs["override_filename"]
# Nothing was found, report the task first argument
if not len(result):
# There are always some arguments to the consume
task_args: Tuple = literal_eval(
literal_eval(obj.attempted_task.task_args),
)
filepath = Path(task_args[0])
result = filepath.name
except Exception as e: # pragma: no cover
# Extra security if something is malformed
logger.warning(f"Error getting file name from task: {e}", exc_info=True)
return result
related_document = serializers.SerializerMethodField()
def get_related_document(self, obj):
result = ""
regexp = r"New document id (\d+) created"
if (
hasattr(obj, "attempted_task")
and obj.attempted_task
and obj.attempted_task.result
and obj.attempted_task.status == states.SUCCESS
):
try:
result = re.search(regexp, obj.attempted_task.result).group(1)
except Exception:
pass
return result
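
The nested literal_eval() calls in get_name() above deal with how django-celery-results stores task_kwargs: as the repr of a string that itself contains a dict repr. In isolation (the sample value is illustrative):

# Why literal_eval() is applied twice: the stored value is a string of a
# string of a dict.
from ast import literal_eval

task_kwargs = "\"{'override_filename': 'invoice.pdf'}\""
kwargs = literal_eval(literal_eval(task_kwargs))  # str -> str -> dict
print(kwargs["override_filename"])                # invoice.pdf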
class AcknowledgeTasksViewSerializer(serializers.Serializer):

View File

@@ -2,7 +2,6 @@ import logging
import os
import shutil
import django_q
from django.conf import settings
from django.contrib.admin.models import ADDITION
from django.contrib.admin.models import LogEntry
@@ -14,6 +13,7 @@ from django.db.models import Q
from django.dispatch import receiver
from django.utils import termcolors
from django.utils import timezone
from django_celery_results.models import TaskResult
from filelock import FileLock
from .. import matching
@@ -25,7 +25,6 @@ from ..models import MatchingModel
from ..models import PaperlessTask
from ..models import Tag
logger = logging.getLogger("paperless.handlers")
@@ -291,7 +290,7 @@ def set_storage_path(
)
+ f" [{document.pk}]",
)
print(f"Sugest storage directory {selected}")
print(f"Suggest storage directory {selected}")
else:
logger.info(
f"Assigning storage path {selected} to {document}",
@@ -503,34 +502,19 @@ def add_to_index(sender, document, **kwargs):
index.add_or_update_document(document)
@receiver(django_q.signals.pre_enqueue)
def init_paperless_task(sender, task, **kwargs):
if task["func"] == "documents.tasks.consume_file":
paperless_task, created = PaperlessTask.objects.get_or_create(
task_id=task["id"],
)
paperless_task.name = task["name"]
paperless_task.created = task["started"]
paperless_task.save()
@receiver(django_q.signals.pre_execute)
def paperless_task_started(sender, task, **kwargs):
@receiver(models.signals.post_save, sender=TaskResult)
def update_paperless_task(sender, instance: TaskResult, **kwargs):
try:
if task["func"] == "documents.tasks.consume_file":
paperless_task = PaperlessTask.objects.get(task_id=task["id"])
paperless_task.started = timezone.now()
paperless_task.save()
except PaperlessTask.DoesNotExist:
pass
@receiver(models.signals.post_save, sender=django_q.models.Task)
def update_paperless_task(sender, instance, **kwargs):
try:
if instance.func == "documents.tasks.consume_file":
paperless_task = PaperlessTask.objects.get(task_id=instance.id)
if instance.task_name == "documents.tasks.consume_file":
paperless_task, _ = PaperlessTask.objects.get_or_create(
task_id=instance.task_id,
)
paperless_task.name = instance.task_name
paperless_task.created = instance.date_created
paperless_task.completed = instance.date_done
paperless_task.attempted_task = instance
paperless_task.save()
except PaperlessTask.DoesNotExist:
pass
except Exception as e:
# Don't let an exception in the signal handlers prevent
# a document from being consumed.
logger.error(f"Creating PaperlessTask failed: {e}")

View File

@@ -1,14 +1,17 @@
import hashlib
import logging
import os
import shutil
import uuid
from pathlib import Path
from typing import Type
import tqdm
from asgiref.sync import async_to_sync
from celery import shared_task
from channels.layers import get_channel_layer
from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from django.db import transaction
from django.db.models.signals import post_save
from documents import barcodes
from documents import index
@@ -17,6 +20,8 @@ from documents.classifier import DocumentClassifier
from documents.classifier import load_classifier
from documents.consumer import Consumer
from documents.consumer import ConsumerError
from documents.file_handling import create_source_path_directory
from documents.file_handling import generate_unique_filename
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
@@ -24,14 +29,16 @@ from documents.models import StoragePath
from documents.models import Tag
from documents.parsers import DocumentParser
from documents.parsers import get_parser_class_for_mime_type
from documents.parsers import ParseError
from documents.sanity_checker import SanityCheckFailedException
from filelock import FileLock
from redis.exceptions import ConnectionError
from whoosh.writing import AsyncWriter
logger = logging.getLogger("paperless.tasks")
@shared_task
def index_optimize():
ix = index.open_index()
writer = AsyncWriter(ix)
@@ -48,6 +55,7 @@ def index_reindex(progress_bar_disable=False):
index.update_document(writer, document)
@shared_task
def train_classifier():
if (
not Tag.objects.filter(matching_algorithm=Tag.MATCH_AUTO).exists()
@@ -76,6 +84,7 @@ def train_classifier():
logger.warning("Classifier error: " + str(e))
@shared_task
def consume_file(
path,
override_filename=None,
@@ -87,32 +96,18 @@ def consume_file(
override_created=None,
):
path = Path(path).resolve()
# check for separators in current document
if settings.CONSUMER_ENABLE_BARCODES:
mime_type = barcodes.get_file_mime_type(path)
pdf_filepath, separators = barcodes.scan_file_for_separating_barcodes(path)
if not barcodes.supported_file_type(mime_type):
# if not supported, skip this routine
logger.warning(
f"Unsupported file format for barcode reader: {str(mime_type)}",
if separators:
logger.debug(
f"Pages with separators found in: {str(path)}",
)
else:
separators = []
document_list = []
if mime_type == "image/tiff":
file_to_process = barcodes.convert_from_tiff_to_pdf(path)
else:
file_to_process = path
separators = barcodes.scan_file_for_separating_barcodes(file_to_process)
if separators:
logger.debug(
f"Pages with separators found in: {str(path)}",
)
document_list = barcodes.separate_pages(file_to_process, separators)
document_list = barcodes.separate_pages(pdf_filepath, separators)
if document_list:
for n, document in enumerate(document_list):
@@ -122,17 +117,31 @@ def consume_file(
newname = f"{str(n)}_" + override_filename
else:
newname = None
barcodes.save_to_dir(document, newname=newname)
# if we got here, the document was successfully split
# and can safely be deleted
if mime_type == "image/tiff":
# Remove the TIFF converted to PDF file
logger.debug(f"Deleting file {file_to_process}")
os.unlink(file_to_process)
# Remove the original file (new file is saved above)
logger.debug(f"Deleting file {path}")
os.unlink(path)
# If the file is an upload, it's in the scratch directory
# Move it to consume directory to be picked up
# Otherwise, use the current parent to keep possible tags
# from subdirectories
try:
# is_relative_to would be nicer, but new in 3.9
_ = path.relative_to(settings.SCRATCH_DIR)
save_to_dir = settings.CONSUMPTION_DIR
except ValueError:
save_to_dir = path.parent
barcodes.save_to_dir(
document,
newname=newname,
target_dir=save_to_dir,
)
# Delete the PDF file which was split
os.remove(pdf_filepath)
# If the original was a TIFF, remove the original file as well
if str(pdf_filepath) != str(path):
logger.debug(f"Deleting file {path}")
os.unlink(path)
# notify the sender, otherwise the progress bar
# in the UI stays stuck
@@ -149,11 +158,8 @@ def consume_file(
"status_updates",
{"type": "status_update", "data": payload},
)
except OSError as e:
logger.warning(
"OSError. It could be, the broker cannot be reached.",
)
logger.warning(str(e))
except ConnectionError as e:
logger.warning(f"ConnectionError on status send: {str(e)}")
# consuming stops here, since the original document with
# the barcodes has been split and will be consumed separately
return "File successfully split"
@@ -179,6 +185,7 @@ def consume_file(
)
@shared_task
def sanity_check():
messages = sanity_checker.check_sanity()
@@ -194,6 +201,7 @@ def sanity_check():
return "No issues detected."
@shared_task
def bulk_update_documents(document_ids):
documents = Document.objects.filter(id__in=document_ids)
@@ -207,44 +215,63 @@ def bulk_update_documents(document_ids):
index.update_document(writer, doc)
def redo_ocr(document_ids):
all_docs = Document.objects.all()
@shared_task
def update_document_archive_file(document_id):
"""
Re-creates the archive file of a document, including new OCR content and thumbnail
"""
document = Document.objects.get(id=document_id)
for doc_pk in document_ids:
try:
logger.info(f"Parsing document {doc_pk}")
doc: Document = all_docs.get(pk=doc_pk)
except ObjectDoesNotExist:
logger.error(f"Document {doc_pk} does not exist")
continue
mime_type = document.mime_type
# Get the correct parser for this mime type
parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(
doc.mime_type,
parser_class: Type[DocumentParser] = get_parser_class_for_mime_type(mime_type)
if not parser_class:
logger.error(
f"No parser found for mime type {mime_type}, cannot "
f"archive document {document} (ID: {document_id})",
)
document_parser: DocumentParser = parser_class(
"redo-ocr",
return
parser: DocumentParser = parser_class(logging_group=uuid.uuid4())
try:
parser.parse(document.source_path, mime_type, document.get_public_filename())
thumbnail = parser.get_thumbnail(
document.source_path,
mime_type,
document.get_public_filename(),
)
# Create a file path to copy the original file to for working on
temp_file = (Path(document_parser.tempdir) / Path("new-ocr-file")).resolve()
if parser.get_archive_path():
with transaction.atomic():
with open(parser.get_archive_path(), "rb") as f:
checksum = hashlib.md5(f.read()).hexdigest()
# I'm going to save first so that in case the file move
# fails, the database is rolled back.
# We also don't use save() since that triggers the filehandling
# logic, and we don't want that yet (file not yet in place)
document.archive_filename = generate_unique_filename(
document,
archive_filename=True,
)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text(),
archive_filename=document.archive_filename,
)
with FileLock(settings.MEDIA_LOCK):
create_source_path_directory(document.archive_path)
shutil.move(parser.get_archive_path(), document.archive_path)
shutil.move(thumbnail, document.thumbnail_path)
shutil.copy(doc.source_path, temp_file)
with index.open_index_writer() as writer:
index.update_document(writer, document)
try:
logger.info(
f"Using {type(document_parser).__name__} for document",
)
# Try to re-parse the document into text
document_parser.parse(str(temp_file), doc.mime_type)
doc.content = document_parser.get_text()
doc.save()
logger.info("Document OCR updated")
except ParseError as e:
logger.error(f"Error parsing document: {e}")
finally:
# Remove the file path if it was created
if temp_file.exists() and temp_file.is_file():
temp_file.unlink()
except Exception:
logger.exception(
f"Error while parsing document {document} " f"(ID: {document_id})",
)
finally:
parser.cleanup()
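
Earlier in this file, consume_file notes that Path.is_relative_to() would be cleaner but only exists from Python 3.9, so it emulates the check with relative_to() and ValueError. That emulation in isolation:

# Sketch of the is_relative_to() emulation used above (the real method
# requires Python 3.9+).
from pathlib import Path

def is_under(path: Path, root: Path) -> bool:
    try:
        path.relative_to(root)
        return True
    except ValueError:
        return False

print(is_under(Path("/tmp/scratch/upload.pdf"), Path("/tmp/scratch")))  # True
print(is_under(Path("/data/consume/doc.pdf"), Path("/tmp/scratch")))    # False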

View File

@@ -10,6 +10,8 @@ import zipfile
from unittest import mock
from unittest.mock import MagicMock
import celery
try:
import zoneinfo
except ImportError:
@@ -20,7 +22,6 @@ from django.conf import settings
from django.contrib.auth.models import User
from django.test import override_settings
from django.utils import timezone
from django_q.models import Task
from documents import bulk_edit
from documents import index
from documents.models import Correspondent
@@ -31,7 +32,8 @@ from documents.models import PaperlessTask
from documents.models import SavedView
from documents.models import StoragePath
from documents.models import Tag
from documents.models import UiSettings
from django_celery_results.models import TaskResult
from documents.models import Comment
from documents.models import StoragePath
from documents.tests.utils import DirectoriesMixin
from paperless import version
@@ -789,7 +791,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertEqual(response.data["documents_inbox"], None)
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload(self, m):
with open(
@@ -812,7 +814,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertIsNone(kwargs["override_document_type_id"])
self.assertIsNone(kwargs["override_tag_ids"])
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_empty_metadata(self, m):
with open(
@@ -835,7 +837,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertIsNone(kwargs["override_document_type_id"])
self.assertIsNone(kwargs["override_tag_ids"])
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_invalid_form(self, m):
with open(
@@ -849,7 +851,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 400)
m.assert_not_called()
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_invalid_file(self, m):
with open(
@@ -863,7 +865,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 400)
m.assert_not_called()
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_title(self, async_task):
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@@ -881,7 +883,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(kwargs["override_title"], "my custom title")
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_correspondent(self, async_task):
c = Correspondent.objects.create(name="test-corres")
with open(
@@ -900,7 +902,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(kwargs["override_correspondent_id"], c.id)
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_invalid_correspondent(self, async_task):
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@@ -914,7 +916,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
async_task.assert_not_called()
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_document_type(self, async_task):
dt = DocumentType.objects.create(name="invoice")
with open(
@@ -933,7 +935,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertEqual(kwargs["override_document_type_id"], dt.id)
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_invalid_document_type(self, async_task):
with open(
os.path.join(os.path.dirname(__file__), "samples", "simple.pdf"),
@@ -947,7 +949,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
async_task.assert_not_called()
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_tags(self, async_task):
t1 = Tag.objects.create(name="tag1")
t2 = Tag.objects.create(name="tag2")
@@ -967,7 +969,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
self.assertCountEqual(kwargs["override_tag_ids"], [t1.id, t2.id])
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_invalid_tags(self, async_task):
t1 = Tag.objects.create(name="tag1")
t2 = Tag.objects.create(name="tag2")
@@ -983,7 +985,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
async_task.assert_not_called()
@mock.patch("documents.views.async_task")
@mock.patch("documents.views.consume_file.delay")
def test_upload_with_created(self, async_task):
created = datetime.datetime(
2022,
@@ -1107,6 +1109,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
"tags": [],
"document_types": [],
"storage_paths": [],
"dates": [],
},
)
@@ -1118,6 +1121,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
@mock.patch("documents.views.match_document_types")
@mock.patch("documents.views.match_tags")
@mock.patch("documents.views.match_correspondents")
@override_settings(NUMBER_OF_SUGGESTED_DATES=10)
def test_get_suggestions(
self,
match_correspondents,
@@ -1128,7 +1132,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
doc = Document.objects.create(
title="test",
mime_type="application/pdf",
content="this is an invoice!",
content="this is an invoice from 12.04.2022!",
)
match_correspondents.return_value = [Correspondent(id=88), Correspondent(id=2)]
@@ -1144,6 +1148,7 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
"tags": [56, 123],
"document_types": [23],
"storage_paths": [99, 77],
"dates": ["2022-04-12"],
},
)
@@ -1354,6 +1359,133 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
1,
)
def test_get_existing_comments(self):
"""
GIVEN:
- A document with a single comment
WHEN:
- API request for document comments is made
THEN:
- The associated comment is returned
"""
doc = Document.objects.create(
title="test",
mime_type="application/pdf",
content="this is a document which will have comments!",
)
comment = Comment.objects.create(
comment="This is a comment.",
document=doc,
user=self.user,
)
response = self.client.get(
f"/api/documents/{doc.pk}/comments/",
format="json",
)
self.assertEqual(response.status_code, 200)
resp_data = response.json()
self.assertEqual(len(resp_data), 1)
resp_data = resp_data[0]
del resp_data["created"]
self.assertDictEqual(
resp_data,
{
"id": comment.id,
"comment": comment.comment,
"user": {
"id": comment.user.id,
"username": comment.user.username,
"firstname": comment.user.first_name,
"lastname": comment.user.last_name,
},
},
)
def test_create_comment(self):
"""
GIVEN:
- Existing document
WHEN:
- API request is made to add a comment
THEN:
- Comment is created and associated with document
"""
doc = Document.objects.create(
title="test",
mime_type="application/pdf",
content="this is a document which will have comments added",
)
resp = self.client.post(
f"/api/documents/{doc.pk}/comments/",
data={"comment": "this is a posted comment"},
)
self.assertEqual(resp.status_code, 200)
response = self.client.get(
f"/api/documents/{doc.pk}/comments/",
format="json",
)
self.assertEqual(response.status_code, 200)
resp_data = response.json()
self.assertEqual(len(resp_data), 1)
resp_data = resp_data[0]
self.assertEqual(resp_data["comment"], "this is a posted comment")
def test_delete_comment(self):
"""
GIVEN:
- Existing document
WHEN:
- API request is made to delete a comment
THEN:
- Comment is deleted and no longer associated with the document
"""
doc = Document.objects.create(
title="test",
mime_type="application/pdf",
content="this is a document which will have comments!",
)
comment = Comment.objects.create(
comment="This is a comment.",
document=doc,
user=self.user,
)
response = self.client.delete(
f"/api/documents/{doc.pk}/comments/?id={comment.pk}",
format="json",
)
self.assertEqual(response.status_code, 200)
self.assertEqual(len(Comment.objects.all()), 0)
def test_get_comments_no_doc(self):
"""
GIVEN:
- A request to get comments from a non-existent document
WHEN:
- API request for document comments is made
THEN:
- HTTP 404 is returned
"""
response = self.client.get(
"/api/documents/500/comments/",
format="json",
)
self.assertEqual(response.status_code, 404)
class TestDocumentApiV2(DirectoriesMixin, APITestCase):
def setUp(self):
@@ -1450,7 +1582,11 @@ class TestApiUiSettings(DirectoriesMixin, APITestCase):
self.assertEqual(response.status_code, 200)
self.assertDictEqual(
response.data["settings"],
{},
{
"update_checking": {
"backend_setting": "default",
},
},
)
def test_api_set_ui_settings(self):
@@ -1484,7 +1620,7 @@ class TestBulkEdit(DirectoriesMixin, APITestCase):
user = User.objects.create_superuser(username="temp_admin")
self.client.force_authenticate(user=user)
patcher = mock.patch("documents.bulk_edit.async_task")
patcher = mock.patch("documents.bulk_edit.bulk_update_documents.delay")
self.async_task = patcher.start()
self.addCleanup(patcher.stop)
self.c1 = Correspondent.objects.create(name="c1")
@@ -2411,38 +2547,6 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
def setUp(self):
super().setUp()
def test_remote_version_default(self):
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertDictEqual(
response.data,
{
"version": "0.0.0",
"update_available": False,
"feature_is_set": False,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=False,
)
def test_remote_version_disabled(self):
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertDictEqual(
response.data,
{
"version": "0.0.0",
"update_available": False,
"feature_is_set": True,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=True,
)
@mock.patch("urllib.request.urlopen")
def test_remote_version_enabled_no_update_prefix(self, urlopen_mock):
@@ -2460,13 +2564,9 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
{
"version": "1.6.0",
"update_available": False,
"feature_is_set": True,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=True,
)
@mock.patch("urllib.request.urlopen")
def test_remote_version_enabled_no_update_no_prefix(self, urlopen_mock):
@@ -2486,13 +2586,9 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
{
"version": version.__full_version_str__,
"update_available": False,
"feature_is_set": True,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=True,
)
@mock.patch("urllib.request.urlopen")
def test_remote_version_enabled_update(self, urlopen_mock):
@@ -2519,13 +2615,9 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
{
"version": new_version_str,
"update_available": True,
"feature_is_set": True,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=True,
)
@mock.patch("urllib.request.urlopen")
def test_remote_version_bad_json(self, urlopen_mock):
@@ -2543,13 +2635,9 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
{
"version": "0.0.0",
"update_available": False,
"feature_is_set": True,
},
)
@override_settings(
ENABLE_UPDATE_CHECK=True,
)
@mock.patch("urllib.request.urlopen")
def test_remote_version_exception(self, urlopen_mock):
@@ -2567,7 +2655,6 @@ class TestApiRemoteVersion(DirectoriesMixin, APITestCase):
{
"version": "0.0.0",
"update_available": False,
"feature_is_set": True,
},
)
@@ -2652,7 +2739,7 @@ class TestApiStoragePaths(DirectoriesMixin, APITestCase):
class TestTasks(APITestCase):
ENDPOINT = "/api/tasks/"
ENDPOINT_ACKOWLEDGE = "/api/acknowledge_tasks/"
ENDPOINT_ACKNOWLEDGE = "/api/acknowledge_tasks/"
def setUp(self):
super().setUp()
@@ -2661,16 +2748,27 @@ class TestTasks(APITestCase):
self.client.force_authenticate(user=self.user)
def test_get_tasks(self):
task_id1 = str(uuid.uuid4())
PaperlessTask.objects.create(task_id=task_id1)
Task.objects.create(
id=task_id1,
started=timezone.now() - datetime.timedelta(seconds=30),
stopped=timezone.now(),
func="documents.tasks.consume_file",
"""
GIVEN:
- Attempted celery tasks
WHEN:
- API call is made to get tasks
THEN:
- Started and pending tasks are serialized and provided
"""
result1 = TaskResult.objects.create(
task_id=str(uuid.uuid4()),
task_name="documents.tasks.some_great_task",
status=celery.states.PENDING,
)
task_id2 = str(uuid.uuid4())
PaperlessTask.objects.create(task_id=task_id2)
PaperlessTask.objects.create(attempted_task=result1)
result2 = TaskResult.objects.create(
task_id=str(uuid.uuid4()),
task_name="documents.tasks.some_awesome_task",
status=celery.states.STARTED,
)
PaperlessTask.objects.create(attempted_task=result2)
response = self.client.get(self.ENDPOINT)
@@ -2678,25 +2776,155 @@ class TestTasks(APITestCase):
self.assertEqual(len(response.data), 2)
returned_task1 = response.data[1]
returned_task2 = response.data[0]
self.assertEqual(returned_task1["task_id"], task_id1)
self.assertEqual(returned_task1["status"], "complete")
self.assertIsNotNone(returned_task1["attempted_task"])
self.assertEqual(returned_task2["task_id"], task_id2)
self.assertEqual(returned_task2["status"], "queued")
self.assertIsNone(returned_task2["attempted_task"])
self.assertEqual(returned_task1["task_id"], result1.task_id)
self.assertEqual(returned_task1["status"], celery.states.PENDING)
self.assertEqual(returned_task1["task_name"], result1.task_name)
self.assertEqual(returned_task2["task_id"], result2.task_id)
self.assertEqual(returned_task2["status"], celery.states.STARTED)
self.assertEqual(returned_task2["task_name"], result2.task_name)
def test_acknowledge_tasks(self):
task_id = str(uuid.uuid4())
task = PaperlessTask.objects.create(task_id=task_id)
"""
GIVEN:
- Attempted celery tasks
WHEN:
- API call is made to mark the task as acknowledged
THEN:
- Task is marked as acknowledged
"""
result1 = TaskResult.objects.create(
task_id=str(uuid.uuid4()),
task_name="documents.tasks.some_task",
status=celery.states.PENDING,
)
task = PaperlessTask.objects.create(attempted_task=result1)
response = self.client.get(self.ENDPOINT)
self.assertEqual(len(response.data), 1)
response = self.client.post(
self.ENDPOINT_ACKOWLEDGE,
self.ENDPOINT_ACKNOWLEDGE,
{"tasks": [task.id]},
)
self.assertEqual(response.status_code, 200)
response = self.client.get(self.ENDPOINT)
self.assertEqual(len(response.data), 0)
def test_task_result_no_error(self):
"""
GIVEN:
- A celery task completed without error
WHEN:
- API call is made to get tasks
THEN:
- The returned data includes the task result
"""
result1 = TaskResult.objects.create(
task_id=str(uuid.uuid4()),
task_name="documents.tasks.some_task",
status=celery.states.SUCCESS,
result="Success. New document id 1 created",
)
_ = PaperlessTask.objects.create(attempted_task=result1)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 1)
returned_data = response.data[0]
self.assertEqual(returned_data["result"], "Success. New document id 1 created")
self.assertEqual(returned_data["related_document"], "1")
def test_task_result_with_error(self):
"""
GIVEN:
- A celery task completed with an exception
WHEN:
- API call is made to get tasks
THEN:
- The returned result is the exception info
"""
result1 = TaskResult.objects.create(
task_id=str(uuid.uuid4()),
task_name="documents.tasks.some_task",
status=celery.states.SUCCESS,
result={
"exc_type": "ConsumerError",
"exc_message": ["test.pdf: Not consuming test.pdf: It is a duplicate."],
"exc_module": "documents.consumer",
},
)
_ = PaperlessTask.objects.create(attempted_task=result1)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 1)
returned_data = response.data[0]
self.assertEqual(
returned_data["result"],
"test.pdf: Not consuming test.pdf: It is a duplicate.",
)
def test_task_name_webui(self):
"""
GIVEN:
- Attempted celery task
- Task was created through the webui
WHEN:
- API call is made to get tasks
THEN:
- Returned data includes the filename
"""
result1 = TaskResult.objects.create(
task_id=str(uuid.uuid4()),
task_name="documents.tasks.some_task",
status=celery.states.SUCCESS,
task_args="\"('/tmp/paperless/paperless-upload-5iq7skzc',)\"",
task_kwargs="\"{'override_filename': 'test.pdf', 'override_title': None, 'override_correspondent_id': None, 'override_document_type_id': None, 'override_tag_ids': None, 'task_id': '466e8fe7-7193-4698-9fff-72f0340e2082', 'override_created': None}\"",
)
_ = PaperlessTask.objects.create(attempted_task=result1)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 1)
returned_data = response.data[0]
self.assertEqual(returned_data["name"], "test.pdf")
def test_task_name_consume_folder(self):
"""
GIVEN:
- Attempted celery task
- Task was created through the consume folder
WHEN:
- API call is made to get tasks
THEN:
- Returned data includes the filename
"""
result1 = TaskResult.objects.create(
task_id=str(uuid.uuid4()),
task_name="documents.tasks.some_task",
status=celery.states.SUCCESS,
task_args="\"('/consume/anothertest.pdf',)\"",
task_kwargs="\"{'override_tag_ids': None}\"",
)
_ = PaperlessTask.objects.create(attempted_task=result1)
response = self.client.get(self.ENDPOINT)
self.assertEqual(response.status_code, 200)
self.assertEqual(len(response.data), 1)
returned_data = response.data[0]
self.assertEqual(returned_data["name"], "anothertest.pdf")
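The two filename tests above imply that the task serializer recovers a display name from the stored task_args/task_kwargs strings. A rough sketch of that mapping, assuming the double-quoted repr format shown in the fixtures (the helper name is hypothetical, not the serializer's actual API):

import ast
import os


def guess_task_name(task_args: str, task_kwargs: str) -> str:
    # The stored value is a quoted repr, so two literal_eval passes are
    # needed: once to unwrap the outer string, once to parse the tuple/dict.
    kwargs = ast.literal_eval(ast.literal_eval(task_kwargs))
    if kwargs.get("override_filename"):
        return kwargs["override_filename"]  # webui uploads carry the filename
    args = ast.literal_eval(ast.literal_eval(task_args))
    return os.path.basename(args[0])  # consume-folder tasks carry only the path


assert guess_task_name(
    "\"('/consume/anothertest.pdf',)\"",
    "\"{'override_tag_ids': None}\"",
) == "anothertest.pdf"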

View File

@@ -3,6 +3,7 @@ import shutil
import tempfile
from unittest import mock
import pikepdf
from django.conf import settings
from django.test import override_settings
from django.test import TestCase
@@ -13,22 +14,23 @@ from PIL import Image
class TestBarcode(DirectoriesMixin, TestCase):
SAMPLE_DIR = os.path.join(
os.path.dirname(__file__),
"samples",
)
BARCODE_SAMPLE_DIR = os.path.join(SAMPLE_DIR, "barcodes")
def test_barcode_reader(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"barcode-39-PATCHT.png",
)
test_file = os.path.join(self.BARCODE_SAMPLE_DIR, "barcode-39-PATCHT.png")
img = Image.open(test_file)
separator_barcode = str(settings.CONSUMER_BARCODE_STRING)
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pbm",
)
img = Image.open(test_file)
@@ -37,9 +39,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_distorsion(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-distorsion.png",
)
img = Image.open(test_file)
@@ -48,9 +48,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_distorsion2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-distorsion2.png",
)
img = Image.open(test_file)
@@ -59,9 +57,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_unreadable(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-PATCHT-unreadable.png",
)
img = Image.open(test_file)
@@ -69,9 +65,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_qr(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"qr-code-PATCHT.png",
)
img = Image.open(test_file)
@@ -80,9 +74,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_128(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-PATCHT.png",
)
img = Image.open(test_file)
@@ -90,15 +82,13 @@ class TestBarcode(DirectoriesMixin, TestCase):
self.assertEqual(barcodes.barcode_reader(img), [separator_barcode])
def test_barcode_reader_no_barcode(self):
test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.png")
test_file = os.path.join(self.SAMPLE_DIR, "simple.png")
img = Image.open(test_file)
self.assertEqual(barcodes.barcode_reader(img), [])
def test_barcode_reader_custom_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.png",
)
img = Image.open(test_file)
@@ -106,9 +96,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_custom_qr_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-qr-custom.png",
)
img = Image.open(test_file)
@@ -116,9 +104,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_reader_custom_128_separator(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.png",
)
img = Image.open(test_file)
@@ -126,19 +112,15 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_get_mime_type(self):
tiff_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.tiff",
)
pdf_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.pdf",
)
png_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.png",
)
tiff_file_no_extension = os.path.join(settings.SCRATCH_DIR, "testfile1")
@@ -173,8 +155,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_convert_error_from_pdf_to_pdf(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.pdf",
)
dst = os.path.join(settings.SCRATCH_DIR, "simple.pdf")
@@ -183,117 +164,235 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_scan_file_for_separating_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_barcodes2(self):
test_file = os.path.join(os.path.dirname(__file__), "samples", "simple.pdf")
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [])
test_file = os.path.join(self.SAMPLE_DIR, "simple.pdf")
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])
def test_scan_file_for_separating_barcodes3(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [1])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_barcodes4(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"several-patcht-codes.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [2, 5])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [2, 5])
def test_scan_file_for_separating_barcodes_upsidedown(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle_reverse.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [1])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_barcodes_pillow_transcode_error(self):
"""
GIVEN:
- A PDF containing an image which cannot be transcoded to a PIL image
WHEN:
- Transcoding the image to a PIL image is attempted but fails
THEN:
- The barcode reader is still called
"""
def _build_device_n_pdf(self, save_path: str):
# Based on the pikepdf tests
# https://github.com/pikepdf/pikepdf/blob/abb35ebe17d579d76abe08265e00cf8890a12a95/tests/test_image_access.py
pdf = pikepdf.new()
pdf.add_blank_page(page_size=(72, 72))
imobj = pikepdf.Stream(
pdf,
bytes(range(0, 256)),
BitsPerComponent=8,
ColorSpace=pikepdf.Array(
[
pikepdf.Name.DeviceN,
pikepdf.Array([pikepdf.Name.Black]),
pikepdf.Name.DeviceCMYK,
pikepdf.Stream(
pdf,
b"{0 0 0 4 -1 roll}", # Colorspace conversion function
FunctionType=4,
Domain=[0.0, 1.0],
Range=[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0],
),
],
),
Width=16,
Height=16,
Type=pikepdf.Name.XObject,
Subtype=pikepdf.Name.Image,
)
pim = pikepdf.PdfImage(imobj)
self.assertEqual(pim.mode, "DeviceN")
self.assertTrue(pim.is_device_n)
pdf.pages[0].Contents = pikepdf.Stream(pdf, b"72 0 0 72 0 0 cm /Im0 Do")
pdf.pages[0].Resources = pikepdf.Dictionary(
XObject=pikepdf.Dictionary(Im0=imobj),
)
pdf.save(save_path)
with tempfile.NamedTemporaryFile(suffix="pdf") as device_n_pdf:
# Build an offending file
_build_device_n_pdf(self, str(device_n_pdf.name))
with mock.patch("documents.barcodes.barcode_reader") as reader:
reader.return_value = list()
_, _ = barcodes.scan_file_for_separating_barcodes(
str(device_n_pdf.name),
)
reader.assert_called()
def test_scan_file_for_separating_barcodes_fax_decode(self):
"""
GIVEN:
- A PDF containing an image using CCITT Group 4 encoding
WHEN:
- Barcode processing happens with the file
THEN:
- The barcode is still detected
"""
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"barcode-fax-image.pdf",
)
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [1])
def test_scan_file_for_separating_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-qr.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-qr-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
@override_settings(CONSUMER_BARCODE_STRING="CUSTOM BARCODE")
def test_scan_file_for_separating_custom_128_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-128-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [0])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [0])
def test_scan_file_for_separating_wrong_qr_barcodes(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"barcode-39-custom.pdf",
)
pages = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertEqual(pages, [])
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(pdf_file, test_file)
self.assertListEqual(separator_page_numbers, [])
def test_separate_pages(self):
test_file = os.path.join(
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
pages = barcodes.separate_pages(test_file, [1])
self.assertEqual(len(pages), 2)
def test_separate_pages_double_code(self):
"""
GIVEN:
- Input PDF with two patch code pages in a row
WHEN:
- The input file is split
THEN:
- Only two files are output
"""
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
"patch-code-t-middle.pdf",
"patch-code-t-double.pdf",
)
pages = barcodes.separate_pages(test_file, [1])
pages = barcodes.separate_pages(test_file, [1, 2])
self.assertEqual(len(pages), 2)
def test_separate_pages_no_list(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
@@ -308,9 +407,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_save_to_dir(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
@@ -320,9 +417,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_save_to_dir2(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
nonexistingdir = "/nowhere"
@@ -340,9 +435,7 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_save_to_dir3(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
@@ -352,35 +445,41 @@ class TestBarcode(DirectoriesMixin, TestCase):
def test_barcode_splitter(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
tempdir = tempfile.mkdtemp(prefix="paperless-", dir=settings.SCRATCH_DIR)
separators = barcodes.scan_file_for_separating_barcodes(test_file)
self.assertTrue(separators)
document_list = barcodes.separate_pages(test_file, separators)
pdf_file, separator_page_numbers = barcodes.scan_file_for_separating_barcodes(
test_file,
)
self.assertEqual(test_file, pdf_file)
self.assertTrue(len(separator_page_numbers) > 0)
document_list = barcodes.separate_pages(test_file, separator_page_numbers)
self.assertTrue(document_list)
for document in document_list:
barcodes.save_to_dir(document, target_dir=tempdir)
target_file1 = os.path.join(tempdir, "patch-code-t-middle_document_0.pdf")
target_file2 = os.path.join(tempdir, "patch-code-t-middle_document_1.pdf")
self.assertTrue(os.path.isfile(target_file1))
self.assertTrue(os.path.isfile(target_file2))
@override_settings(CONSUMER_ENABLE_BARCODES=True)
def test_consume_barcode_file(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.pdf",
)
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.pdf")
shutil.copy(test_file, dst)
self.assertEqual(tasks.consume_file(dst), "File successfully split")
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(tasks.consume_file(dst), "File successfully split")
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
@@ -388,15 +487,14 @@ class TestBarcode(DirectoriesMixin, TestCase):
)
def test_consume_barcode_tiff_file(self):
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.tiff",
)
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle.tiff")
shutil.copy(test_file, dst)
self.assertEqual(tasks.consume_file(dst), "File successfully split")
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(tasks.consume_file(dst), "File successfully split")
@override_settings(
CONSUMER_ENABLE_BARCODES=True,
@@ -412,18 +510,17 @@ class TestBarcode(DirectoriesMixin, TestCase):
and continue archiving the file as is.
"""
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
self.SAMPLE_DIR,
"simple.jpg",
)
dst = os.path.join(settings.SCRATCH_DIR, "simple.jpg")
shutil.copy(test_file, dst)
with self.assertLogs("paperless.tasks", level="WARNING") as cm:
with self.assertLogs("paperless.barcodes", level="WARNING") as cm:
self.assertIn("Success", tasks.consume_file(dst))
self.assertListEqual(
cm.output,
[
"WARNING:paperless.tasks:Unsupported file format for barcode reader: image/jpeg",
"WARNING:paperless.barcodes:Unsupported file format for barcode reader: image/jpeg",
],
)
m.assert_called_once()
@@ -445,12 +542,11 @@ class TestBarcode(DirectoriesMixin, TestCase):
the user uploads a supported image file, but without extension
"""
test_file = os.path.join(
os.path.dirname(__file__),
"samples",
"barcodes",
self.BARCODE_SAMPLE_DIR,
"patch-code-t-middle.tiff",
)
dst = os.path.join(settings.SCRATCH_DIR, "patch-code-t-middle")
shutil.copy(test_file, dst)
self.assertEqual(tasks.consume_file(dst), "File successfully split")
with mock.patch("documents.tasks.async_to_sync"):
self.assertEqual(tasks.consume_file(dst), "File successfully split")
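Taken together, these tests exercise the scan, split, and save pipeline. A condensed sketch of that flow, mirroring test_barcode_splitter above (paths are illustrative):

from documents import barcodes

# scan_file_for_separating_barcodes now returns the (possibly converted)
# PDF path together with the separator page numbers.
pdf_path, separators = barcodes.scan_file_for_separating_barcodes(
    "/tmp/inbox/scan.pdf",
)
if separators:
    for part in barcodes.separate_pages(pdf_path, separators):
        barcodes.save_to_dir(part, target_dir="/tmp/split")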

View File

@@ -1,9 +1,9 @@
import os
import re
import tempfile
from pathlib import Path
from unittest import mock
import documents
import pytest
from django.conf import settings
from django.test import override_settings
@@ -20,10 +20,19 @@ from documents.models import Tag
from documents.tests.utils import DirectoriesMixin
def dummy_preprocess(content: str):
content = content.lower().strip()
content = re.sub(r"\s+", " ", content)
return content
class TestClassifier(DirectoriesMixin, TestCase):
def setUp(self):
super().setUp()
self.classifier = DocumentClassifier()
self.classifier.preprocess_content = mock.MagicMock(
side_effect=dummy_preprocess,
)
def generate_test_data(self):
self.c1 = Correspondent.objects.create(
@@ -192,6 +201,8 @@ class TestClassifier(DirectoriesMixin, TestCase):
new_classifier = DocumentClassifier()
new_classifier.load()
new_classifier.preprocess_content = mock.MagicMock(side_effect=dummy_preprocess)
self.assertFalse(new_classifier.train())
# @override_settings(
@@ -215,6 +226,7 @@ class TestClassifier(DirectoriesMixin, TestCase):
new_classifier = DocumentClassifier()
new_classifier.load()
new_classifier.preprocess_content = mock.MagicMock(side_effect=dummy_preprocess)
self.assertCountEqual(new_classifier.predict_tags(self.doc2.content), [45, 12])

View File

@@ -8,6 +8,7 @@ from django.conf import settings
from django.test import override_settings
from django.test import TestCase
from documents.parsers import parse_date
from documents.parsers import parse_date_generator
from paperless.settings import DATE_ORDER
@@ -161,6 +162,25 @@ class TestDate(TestCase):
def test_crazy_date_with_spaces(self, *args):
self.assertIsNone(parse_date("", "20 408000l 2475"))
def test_multiple_dates(self):
text = """This text has multiple dates.
For example 02.02.2018, 22 July 2022 and Dezember 2021.
But not 24-12-9999 because its in the future..."""
dates = list(parse_date_generator("", text))
self.assertEqual(len(dates), 3)
self.assertEqual(
dates[0],
datetime.datetime(2018, 2, 2, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
self.assertEqual(
dates[1],
datetime.datetime(2022, 7, 22, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
self.assertEqual(
dates[2],
datetime.datetime(2021, 12, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(FILENAME_DATE_ORDER="YMD")
def test_filename_date_parse_valid_ymd(self, *args):
"""

View File

@@ -10,8 +10,8 @@ from django.core.management import call_command
from django.test import override_settings
from django.test import TestCase
from documents.file_handling import generate_filename
from documents.management.commands.document_archiver import handle_document
from documents.models import Document
from documents.tasks import update_document_archive_file
from documents.tests.utils import DirectoriesMixin
@@ -46,7 +46,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
os.path.join(self.dirs.originals_dir, f"{doc.id:07}.pdf"),
)
handle_document(doc.pk)
update_document_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
@@ -63,7 +63,7 @@ class TestArchiver(DirectoriesMixin, TestCase):
doc.save()
shutil.copy(sample_file, doc.source_path)
handle_document(doc.pk)
update_document_archive_file(doc.pk)
doc = Document.objects.get(id=doc.id)
@@ -94,8 +94,8 @@ class TestArchiver(DirectoriesMixin, TestCase):
os.path.join(self.dirs.originals_dir, f"document_01.pdf"),
)
handle_document(doc2.pk)
handle_document(doc1.pk)
update_document_archive_file(doc2.pk)
update_document_archive_file(doc1.pk)
doc1 = Document.objects.get(id=doc1.id)
doc2 = Document.objects.get(id=doc2.id)

View File

@@ -20,13 +20,14 @@ class ConsumerThread(Thread):
def __init__(self):
super().__init__()
self.cmd = document_consumer.Command()
self.cmd.stop_flag.clear()
def run(self) -> None:
self.cmd.handle(directory=settings.CONSUMPTION_DIR, oneshot=False)
self.cmd.handle(directory=settings.CONSUMPTION_DIR, oneshot=False, testing=True)
def stop(self):
# Consumer checks this every second.
self.cmd.stop_flag = True
self.cmd.stop_flag.set()
def chunked(size, source):
@@ -42,7 +43,7 @@ class ConsumerMixin:
super().setUp()
self.t = None
patcher = mock.patch(
"documents.management.commands.document_consumer.async_task",
"documents.tasks.consume_file.delay",
)
self.task_mock = patcher.start()
self.addCleanup(patcher.stop)
@@ -59,13 +60,14 @@ class ConsumerMixin:
self.t.stop()
# wait for the consumer to exit.
self.t.join()
self.t = None
super().tearDown()
def wait_for_task_mock_call(self, excpeted_call_count=1):
def wait_for_task_mock_call(self, expected_call_count=1):
n = 0
while n < 100:
if self.task_mock.call_count >= excpeted_call_count:
while n < 50:
if self.task_mock.call_count >= expected_call_count:
# give task_mock some time to finish and raise errors
sleep(1)
return
@@ -74,7 +76,7 @@ class ConsumerMixin:
# A bogus async_task that will simply check the file for
# completeness and raise an exception otherwise.
def bogus_task(self, func, filename, **kwargs):
def bogus_task(self, filename, **kwargs):
eq = filecmp.cmp(filename, self.sample_file, shallow=False)
if not eq:
print("Consumed an INVALID file.")
@@ -113,7 +115,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.task_mock.assert_called_once()
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], f)
self.assertEqual(args[0], f)
def test_consume_file_invalid_ext(self):
self.t_start()
@@ -133,7 +135,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.task_mock.assert_called_once()
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], f)
self.assertEqual(args[0], f)
@mock.patch("documents.management.commands.document_consumer.logger.error")
def test_slow_write_pdf(self, error_logger):
@@ -153,7 +155,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.task_mock.assert_called_once()
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], fname)
self.assertEqual(args[0], fname)
@mock.patch("documents.management.commands.document_consumer.logger.error")
def test_slow_write_and_move(self, error_logger):
@@ -173,7 +175,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.task_mock.assert_called_once()
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], fname2)
self.assertEqual(args[0], fname2)
error_logger.assert_not_called()
@@ -191,7 +193,7 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
self.task_mock.assert_called_once()
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], fname)
self.assertEqual(args[0], fname)
# assert that we have an error logged with this invalid file.
error_logger.assert_called_once()
@@ -234,12 +236,12 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
sleep(5)
self.wait_for_task_mock_call(excpeted_call_count=2)
self.wait_for_task_mock_call(expected_call_count=2)
self.assertEqual(2, self.task_mock.call_count)
fnames = [
os.path.basename(args[1]) for args, _ in self.task_mock.call_args_list
os.path.basename(args[0]) for args, _ in self.task_mock.call_args_list
]
self.assertCountEqual(fnames, ["my_file.pdf", "my_second_file.pdf"])
@@ -281,6 +283,8 @@ class TestConsumer(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
@override_settings(
CONSUMER_POLLING=1,
# please leave the delay here and down below
# see https://github.com/paperless-ngx/paperless-ngx/pull/66
CONSUMER_POLLING_DELAY=3,
CONSUMER_POLLING_RETRY_COUNT=20,
)
@@ -307,8 +311,7 @@ class TestConsumerRecursivePolling(TestConsumer):
class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
@override_settings(CONSUMER_RECURSIVE=True)
@override_settings(CONSUMER_SUBDIRS_AS_TAGS=True)
@override_settings(CONSUMER_RECURSIVE=True, CONSUMER_SUBDIRS_AS_TAGS=True)
def test_consume_file_with_path_tags(self):
tag_names = ("existingTag", "Space Tag")
@@ -335,7 +338,7 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
tag_ids.append(Tag.objects.get(name=tag_names[1]).pk)
args, kwargs = self.task_mock.call_args
self.assertEqual(args[1], f)
self.assertEqual(args[0], f)
# assertCountEqual has a bad name, but test that the first
# sequence contains the same elements as second, regardless of
@@ -344,7 +347,7 @@ class TestConsumerTags(DirectoriesMixin, ConsumerMixin, TransactionTestCase):
@override_settings(
CONSUMER_POLLING=1,
CONSUMER_POLLING_DELAY=1,
CONSUMER_POLLING_DELAY=3,
CONSUMER_POLLING_RETRY_COUNT=20,
)
def test_consume_file_with_path_tags_polling(self):
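One structural change worth noting above: the consumer's stop_flag moves from a plain boolean to a threading.Event, which is what the clear()/set() calls rely on. A minimal sketch of that pattern (the loop body is elided):

import threading

stop_flag = threading.Event()


def consume_loop():
    # Poll the consumption directory until signalled to stop; each hit
    # would dispatch consume_file.delay(...) as the tests above mock.
    while not stop_flag.is_set():
        pass


# From the controlling thread (or a signal handler):
stop_flag.set()  # the loop observes this within one polling interval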

View File

@@ -10,10 +10,13 @@ from django.core.management import call_command
from django.test import override_settings
from django.test import TestCase
from documents.management.commands import document_exporter
from documents.models import Comment
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
from documents.models import User
from documents.sanity_checker import check_sanity
from documents.settings import EXPORTER_FILE_NAME
from documents.tests.utils import DirectoriesMixin
@@ -25,6 +28,8 @@ class TestExportImport(DirectoriesMixin, TestCase):
self.target = tempfile.mkdtemp()
self.addCleanup(shutil.rmtree, self.target)
self.user = User.objects.create(username="temp_admin")
self.d1 = Document.objects.create(
content="Content",
checksum="42995833e01aea9b3edee44bbfdd7ce1",
@@ -57,14 +62,23 @@ class TestExportImport(DirectoriesMixin, TestCase):
storage_type=Document.STORAGE_TYPE_GPG,
)
self.comment = Comment.objects.create(
comment="This is a comment. amaze.",
document=self.d1,
user=self.user,
)
self.t1 = Tag.objects.create(name="t")
self.dt1 = DocumentType.objects.create(name="dt")
self.c1 = Correspondent.objects.create(name="c")
self.sp1 = StoragePath.objects.create(path="{created_year}-{title}")
self.d1.tags.add(self.t1)
self.d1.correspondent = self.c1
self.d1.document_type = self.dt1
self.d1.save()
self.d4.storage_path = self.sp1
self.d4.save()
super().setUp()
def _get_document_from_manifest(self, manifest, id):
@@ -110,7 +124,7 @@ class TestExportImport(DirectoriesMixin, TestCase):
manifest = self._do_export(use_filename_format=use_filename_format)
self.assertEqual(len(manifest), 8)
self.assertEqual(len(manifest), 11)
self.assertEqual(
len(list(filter(lambda e: e["model"] == "documents.document", manifest))),
4,
@@ -171,6 +185,11 @@ class TestExportImport(DirectoriesMixin, TestCase):
checksum = hashlib.md5(f.read()).hexdigest()
self.assertEqual(checksum, element["fields"]["archive_checksum"])
elif element["model"] == "documents.comment":
self.assertEqual(element["fields"]["comment"], self.comment.comment)
self.assertEqual(element["fields"]["document"], self.d1.id)
self.assertEqual(element["fields"]["user"], self.user.id)
with paperless_environment() as dirs:
self.assertEqual(Document.objects.count(), 4)
Document.objects.all().delete()
@@ -184,6 +203,7 @@ class TestExportImport(DirectoriesMixin, TestCase):
self.assertEqual(Tag.objects.count(), 1)
self.assertEqual(Correspondent.objects.count(), 1)
self.assertEqual(DocumentType.objects.count(), 1)
self.assertEqual(StoragePath.objects.count(), 1)
self.assertEqual(Document.objects.get(id=self.d1.id).title, "wow1")
self.assertEqual(Document.objects.get(id=self.d2.id).title, "wow2")
self.assertEqual(Document.objects.get(id=self.d3.id).title, "wow2")
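For reference, outside the test harness the exporter these assertions cover is driven as a management command; a hedged sketch (the target path is illustrative):

from django.core.management import call_command

# Writes originals, archive versions and the manifest (which now also
# carries documents.comment entries) to the target directory.
call_command("document_exporter", "/tmp/export-target")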

View File

@@ -3,12 +3,34 @@ from django.test import TestCase
from documents.models import Correspondent
from documents.models import Document
from documents.models import DocumentType
from documents.models import StoragePath
from documents.models import Tag
from documents.tests.utils import DirectoriesMixin
class TestRetagger(DirectoriesMixin, TestCase):
def make_models(self):
self.sp1 = StoragePath.objects.create(
name="dummy a",
path="{created_data}/{title}",
match="auto document",
matching_algorithm=StoragePath.MATCH_LITERAL,
)
self.sp2 = StoragePath.objects.create(
name="dummy b",
path="{title}",
match="^first|^unrelated",
matching_algorithm=StoragePath.MATCH_REGEX,
)
self.sp3 = StoragePath.objects.create(
name="dummy c",
path="{title}",
match="^blah",
matching_algorithm=StoragePath.MATCH_REGEX,
)
self.d1 = Document.objects.create(
checksum="A",
title="A",
@@ -23,6 +45,7 @@ class TestRetagger(DirectoriesMixin, TestCase):
checksum="C",
title="C",
content="unrelated document",
storage_path=self.sp3,
)
self.d4 = Document.objects.create(
checksum="D",
@@ -146,15 +169,15 @@ class TestRetagger(DirectoriesMixin, TestCase):
call_command("document_retagger", "--document_type", "--suggest")
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.document_type, None)
self.assertEqual(d_second.document_type, None)
self.assertIsNone(d_first.document_type)
self.assertIsNone(d_second.document_type)
def test_add_correspondent_suggest(self):
call_command("document_retagger", "--correspondent", "--suggest")
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.correspondent, None)
self.assertEqual(d_second.correspondent, None)
self.assertIsNone(d_first.correspondent)
self.assertIsNone(d_second.correspondent)
def test_add_tags_suggest_url(self):
call_command(
@@ -178,8 +201,8 @@ class TestRetagger(DirectoriesMixin, TestCase):
)
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.document_type, None)
self.assertEqual(d_second.document_type, None)
self.assertIsNone(d_first.document_type)
self.assertIsNone(d_second.document_type)
def test_add_correspondent_suggest_url(self):
call_command(
@@ -190,5 +213,48 @@ class TestRetagger(DirectoriesMixin, TestCase):
)
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.correspondent, None)
self.assertEqual(d_second.correspondent, None)
self.assertIsNone(d_first.correspondent)
self.assertIsNone(d_second.correspondent)
def test_add_storage_path(self):
"""
GIVEN:
- 2 storage paths with documents which match them
- 1 document which matches but has a storage path
WHEN:
- document retagger is called
THEN:
- Matching document's storage paths updated
- Non-matching documents have no storage path
- Existing storage path left unchanged
"""
call_command(
"document_retagger",
"--storage_path",
)
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.storage_path, self.sp2)
self.assertEqual(d_auto.storage_path, self.sp1)
self.assertIsNone(d_second.storage_path)
self.assertEqual(d_unrelated.storage_path, self.sp3)
def test_overwrite_storage_path(self):
"""
GIVEN:
- 2 storage paths with documents which match them
- 1 document which matches but has a storage path
WHEN:
- document retagger is called with overwrite
THEN:
- Matching document's storage paths updated
- Non-matching documents have no storage path
- Existing storage path overwritten
"""
call_command("document_retagger", "--storage_path", "--overwrite")
d_first, d_second, d_unrelated, d_auto = self.get_updated_docs()
self.assertEqual(d_first.storage_path, self.sp2)
self.assertEqual(d_auto.storage_path, self.sp1)
self.assertIsNone(d_second.storage_path)
self.assertEqual(d_unrelated.storage_path, self.sp2)

View File

@@ -1,35 +0,0 @@
import logging
from unittest import mock
from django.test import TestCase
from paperless.settings import default_task_workers
from paperless.settings import default_threads_per_worker
class TestSettings(TestCase):
@mock.patch("paperless.settings.multiprocessing.cpu_count")
def test_single_core(self, cpu_count):
cpu_count.return_value = 1
default_workers = default_task_workers()
default_threads = default_threads_per_worker(default_workers)
self.assertEqual(default_workers, 1)
self.assertEqual(default_threads, 1)
def test_workers_threads(self):
for i in range(1, 64):
with mock.patch(
"paperless.settings.multiprocessing.cpu_count",
) as cpu_count:
cpu_count.return_value = i
default_workers = default_task_workers()
default_threads = default_threads_per_worker(default_workers)
self.assertTrue(default_workers >= 1)
self.assertTrue(default_threads >= 1)
self.assertTrue(default_workers * default_threads <= i, f"{i}")

View File

@@ -11,6 +11,7 @@ from documents.models import DocumentType
from documents.models import Tag
from documents.sanity_checker import SanityCheckFailedException
from documents.sanity_checker import SanityCheckMessages
from documents.tests.test_classifier import dummy_preprocess
from documents.tests.utils import DirectoriesMixin
@@ -75,21 +76,26 @@ class TestClassifier(DirectoriesMixin, TestCase):
doc = Document.objects.create(correspondent=c, content="test", title="test")
self.assertFalse(os.path.isfile(settings.MODEL_FILE))
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime = os.stat(settings.MODEL_FILE).st_mtime
with mock.patch(
"documents.classifier.DocumentClassifier.preprocess_content",
) as pre_proc_mock:
pre_proc_mock.side_effect = dummy_preprocess
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime2 = os.stat(settings.MODEL_FILE).st_mtime
self.assertEqual(mtime, mtime2)
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime = os.stat(settings.MODEL_FILE).st_mtime
doc.content = "test2"
doc.save()
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime3 = os.stat(settings.MODEL_FILE).st_mtime
self.assertNotEqual(mtime2, mtime3)
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime2 = os.stat(settings.MODEL_FILE).st_mtime
self.assertEqual(mtime, mtime2)
doc.content = "test2"
doc.save()
tasks.train_classifier()
self.assertTrue(os.path.isfile(settings.MODEL_FILE))
mtime3 = os.stat(settings.MODEL_FILE).st_mtime
self.assertNotEqual(mtime2, mtime3)
class TestSanityCheck(DirectoriesMixin, TestCase):

View File

@@ -1,3 +1,4 @@
import itertools
import json
import logging
import os
@@ -21,12 +22,13 @@ from django.db.models.functions import Lower
from django.http import Http404
from django.http import HttpResponse
from django.http import HttpResponseBadRequest
from django.shortcuts import get_object_or_404
from django.utils.decorators import method_decorator
from django.utils.translation import get_language
from django.views.decorators.cache import cache_control
from django.views.generic import TemplateView
from django_filters.rest_framework import DjangoFilterBackend
from django_q.tasks import async_task
from documents.tasks import consume_file
from packaging import version as packaging_version
from paperless import version
from paperless.db import GnuPG
@@ -62,6 +64,7 @@ from .matching import match_correspondents
from .matching import match_document_types
from .matching import match_storage_paths
from .matching import match_tags
from .models import Comment
from .models import Correspondent
from .models import Document
from .models import DocumentType
@@ -70,6 +73,7 @@ from .models import SavedView
from .models import StoragePath
from .models import Tag
from .parsers import get_parser_class_for_mime_type
from .parsers import parse_date_generator
from .serialisers import AcknowledgeTasksViewSerializer
from .serialisers import BulkDownloadSerializer
from .serialisers import BulkEditSerializer
@@ -257,6 +261,9 @@ class DocumentViewSet(
file_handle = doc.source_file
filename = doc.get_public_filename()
mime_type = doc.mime_type
# Support browser previewing csv files by using text mime type
if mime_type in {"application/csv", "text/csv"} and disposition == "inline":
mime_type = "text/plain"
if doc.storage_type == Document.STORAGE_TYPE_GPG:
file_handle = GnuPG.decrypted(file_handle)
@@ -313,6 +320,7 @@ class DocumentViewSet(
"original_metadata": self.get_metadata(doc.source_path, doc.mime_type),
"archive_checksum": doc.archive_checksum,
"archive_media_filename": doc.archive_filename,
"original_filename": doc.original_filename,
}
if doc.has_archive_version:
@@ -329,13 +337,15 @@ class DocumentViewSet(
@action(methods=["get"], detail=True)
def suggestions(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
except Document.DoesNotExist:
raise Http404()
doc = get_object_or_404(Document, pk=pk)
classifier = load_classifier()
gen = parse_date_generator(doc.filename, doc.content)
dates = sorted(
{i for i in itertools.islice(gen, settings.NUMBER_OF_SUGGESTED_DATES)},
)
return Response(
{
"correspondents": [c.id for c in match_correspondents(doc, classifier)],
@@ -344,6 +354,9 @@ class DocumentViewSet(
dt.id for dt in match_document_types(doc, classifier)
],
"storage_paths": [dt.id for dt in match_storage_paths(doc, classifier)],
"dates": [
date.strftime("%Y-%m-%d") for date in dates if date is not None
],
},
)
@@ -378,6 +391,67 @@ class DocumentViewSet(
except (FileNotFoundError, Document.DoesNotExist):
raise Http404()
def getComments(self, doc):
return [
{
"id": c.id,
"comment": c.comment,
"created": c.created,
"user": {
"id": c.user.id,
"username": c.user.username,
"firstname": c.user.first_name,
"lastname": c.user.last_name,
},
}
for c in Comment.objects.filter(document=doc).order_by("-created")
]
@action(methods=["get", "post", "delete"], detail=True)
def comments(self, request, pk=None):
try:
doc = Document.objects.get(pk=pk)
except Document.DoesNotExist:
raise Http404()
currentUser = request.user
if request.method == "GET":
try:
return Response(self.getComments(doc))
except Exception as e:
logger.warning(f"An error occurred retrieving comments: {str(e)}")
return Response(
{"error": "Error retreiving comments, check logs for more detail."},
)
elif request.method == "POST":
try:
c = Comment.objects.create(
document=doc,
comment=request.data["comment"],
user=currentUser,
)
c.save()
return Response(self.getComments(doc))
except Exception as e:
logger.warning(f"An error occurred saving comment: {str(e)}")
return Response(
{
"error": "Error saving comment, check logs for more detail.",
},
)
elif request.method == "DELETE":
comment = Comment.objects.get(id=int(request.GET.get("id")))
comment.delete()
return Response(self.getComments(doc))
return Response(
{
"error": "error",
},
)
class SearchResultSerializer(DocumentSerializer):
def to_representation(self, instance):
@@ -541,8 +615,7 @@ class PostDocumentView(GenericAPIView):
task_id = str(uuid.uuid4())
async_task(
"documents.tasks.consume_file",
consume_file.delay(
temp_filename,
override_filename=doc_name,
override_title=title,
@@ -550,7 +623,6 @@ class PostDocumentView(GenericAPIView):
override_document_type_id=document_type_id,
override_tag_ids=tag_ids,
task_id=task_id,
task_name=os.path.basename(doc_name)[:100],
override_created=created,
)
@@ -709,42 +781,38 @@ class RemoteVersionView(GenericAPIView):
remote_version = "0.0.0"
is_greater_than_current = False
current_version = packaging_version.parse(version.__full_version_str__)
# TODO: this can likely be removed when frontend settings are saved to DB
feature_is_set = settings.ENABLE_UPDATE_CHECK != "default"
if feature_is_set and settings.ENABLE_UPDATE_CHECK:
try:
req = urllib.request.Request(
"https://api.github.com/repos/paperless-ngx/"
"paperless-ngx/releases/latest",
)
# Ensure a JSON response
req.add_header("Accept", "application/json")
with urllib.request.urlopen(req) as response:
remote = response.read().decode("utf-8")
try:
remote_json = json.loads(remote)
remote_version = remote_json["tag_name"]
# Basically PEP 616 but that only went in 3.9
if remote_version.startswith("ngx-"):
remote_version = remote_version[len("ngx-") :]
except ValueError:
logger.debug("An error occurred parsing remote version json")
except urllib.error.URLError:
logger.debug("An error occurred checking for available updates")
is_greater_than_current = (
packaging_version.parse(
remote_version,
)
> current_version
try:
req = urllib.request.Request(
"https://api.github.com/repos/paperless-ngx/"
"paperless-ngx/releases/latest",
)
# Ensure a JSON response
req.add_header("Accept", "application/json")
with urllib.request.urlopen(req) as response:
remote = response.read().decode("utf-8")
try:
remote_json = json.loads(remote)
remote_version = remote_json["tag_name"]
# Basically PEP 616 but that only went in 3.9
if remote_version.startswith("ngx-"):
remote_version = remote_version[len("ngx-") :]
except ValueError:
logger.debug("An error occurred parsing remote version json")
except urllib.error.URLError:
logger.debug("An error occurred checking for available updates")
is_greater_than_current = (
packaging_version.parse(
remote_version,
)
> current_version
)
return Response(
{
"version": remote_version,
"update_available": is_greater_than_current,
"feature_is_set": feature_is_set,
},
)
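The comparison relies on packaging's version parsing; a quick sketch of both the prefix strip and the comparison:

from packaging import version as packaging_version

tag_name = "ngx-1.9.2"
# Equivalent of the slice above; str.removeprefix() only exists on 3.9+.
remote_version = tag_name[len("ngx-"):] if tag_name.startswith("ngx-") else tag_name

current = packaging_version.parse("1.9.1")
print(packaging_version.parse(remote_version) > current)  # True
print(packaging_version.parse("0.0.0") > current)         # False: the fallback never reports an update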
@@ -777,15 +845,23 @@ class UiSettingsView(GenericAPIView):
displayname = user.username
if user.first_name or user.last_name:
displayname = " ".join([user.first_name, user.last_name])
settings = {}
ui_settings = {}
if hasattr(user, "ui_settings"):
settings = user.ui_settings.settings
ui_settings = user.ui_settings.settings
if "update_checking" in ui_settings:
ui_settings["update_checking"][
"backend_setting"
] = settings.ENABLE_UPDATE_CHECK
else:
ui_settings["update_checking"] = {
"backend_setting": settings.ENABLE_UPDATE_CHECK,
}
return Response(
{
"user_id": user.id,
"username": user.username,
"display_name": displayname,
"settings": settings,
"settings": ui_settings,
},
)
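The rename avoids shadowing django.conf.settings; the merge itself reduces to a small dict update, sketched here with stand-in values:

ENABLE_UPDATE_CHECK = True  # stand-in for settings.ENABLE_UPDATE_CHECK

ui_settings = {"dark_mode": True, "update_checking": {"enabled": False}}
if "update_checking" in ui_settings:
    ui_settings["update_checking"]["backend_setting"] = ENABLE_UPDATE_CHECK
else:
    ui_settings["update_checking"] = {"backend_setting": ENABLE_UPDATE_CHECK}

print(ui_settings["update_checking"])
# {'enabled': False, 'backend_setting': True}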
@@ -810,8 +886,9 @@ class TasksViewSet(ReadOnlyModelViewSet):
queryset = (
PaperlessTask.objects.filter(
acknowledged=False,
attempted_task__isnull=False,
)
.order_by("created")
.order_by("attempted_task__date_created")
.reverse()
)

View File

@@ -5,15 +5,15 @@ msgstr ""
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"Last-Translator: \n"
"Language-Team: Arabic, Saudi Arabia\n"
"Language: ar_SA\n"
"Language-Team: Arabic, Arabic\n"
"Language: ar_AR\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=UTF-8\n"
"Content-Transfer-Encoding: 8bit\n"
"Plural-Forms: nplurals=6; plural=(n==0 ? 0 : n==1 ? 1 : n==2 ? 2 : n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5);\n"
"X-Crowdin-Project: paperless-ngx\n"
"X-Crowdin-Project-ID: 500308\n"
"X-Crowdin-Language: ar-SA\n"
"X-Crowdin-Language: ar-AR\n"
"X-Crowdin-File: /dev/src/locale/en_US/LC_MESSAGES/django.po\n"
"X-Crowdin-File-ID: 14\n"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-07-29 20:44\n"
"Last-Translator: \n"
"Language-Team: Belarusian\n"
"Language: be_BY\n"
@@ -100,7 +100,7 @@ msgstr "тыпы дакументаў"
#: documents/models.py:90
msgid "path"
msgstr ""
msgstr "шлях"
#: documents/models.py:96 documents/models.py:124
msgid "storage path"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-09-04 11:44\n"
"Last-Translator: \n"
"Language-Team: German\n"
"Language: de_DE\n"
@@ -376,7 +376,7 @@ msgstr "Filterregeln"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "gestartet"
#: documents/serialisers.py:70
#, python-format
@@ -402,7 +402,7 @@ msgstr "Paperless-ngx wird geladen..."
#: documents/templates/index.html:79
msgid "Still here?! Hmm, something might be wrong."
msgstr "Du bist noch hier?! Hmm, da muss wohl etwas schief gelaufen sein."
msgstr "Du bist noch hier? Hmm, da muss wohl etwas schiefgelaufen sein."
#: documents/templates/index.html:79
msgid "Here's a link to the docs."
@@ -654,7 +654,7 @@ msgstr "Als wichtig markieren, markierte E-Mails nicht verarbeiten"
#: paperless_mail/models.py:68
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
msgstr "Markiere die Mail mit dem angegebenen Tag, verarbeite nicht markierte Mails"
#: paperless_mail/models.py:71
msgid "Use subject as title"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-09-06 20:21\n"
"Last-Translator: \n"
"Language-Team: Finnish\n"
"Language: fi_FI\n"
@@ -376,7 +376,7 @@ msgstr "suodatussäännöt"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "aloitettu"
#: documents/serialisers.py:70
#, python-format
@@ -638,11 +638,11 @@ msgstr "Prosessoi kaikki tiedostot, sisältäen \"inline\"-liitteet."
#: paperless_mail/models.py:64
msgid "Delete"
msgstr ""
msgstr "Poista"
#: paperless_mail/models.py:65
msgid "Move to specified folder"
msgstr ""
msgstr "Siirrä määritettyyn kansioon"
#: paperless_mail/models.py:66
msgid "Mark as read, don't process read mails"
@@ -650,117 +650,117 @@ msgstr "Merkitse luetuksi, älä prosessoi luettuja sähköposteja"
#: paperless_mail/models.py:67
msgid "Flag the mail, don't process flagged mails"
msgstr ""
msgstr "Liputa sähköposti, älä käsittele liputettuja sähköposteja"
#: paperless_mail/models.py:68
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
msgstr "Merkitse viesti määrätyllä tagilla, älä käsittele tageja"
#: paperless_mail/models.py:71
msgid "Use subject as title"
msgstr ""
msgstr "Käytä aihetta otsikkona"
#: paperless_mail/models.py:72
msgid "Use attachment filename as title"
msgstr ""
msgstr "Käytä liitteen tiedostonimeä otsikkona"
#: paperless_mail/models.py:75
msgid "Do not assign a correspondent"
msgstr ""
msgstr "Älä määritä yhteyshenkilöä"
#: paperless_mail/models.py:76
msgid "Use mail address"
msgstr ""
msgstr "Käytä sähköpostiosoitetta"
#: paperless_mail/models.py:77
msgid "Use name (or mail address if not available)"
msgstr ""
msgstr "Käytä nimeä (tai sähköpostiosoitetta, jos ei ole saatavilla)"
#: paperless_mail/models.py:78
msgid "Use correspondent selected below"
msgstr ""
msgstr "Käytä alla valittua yhteyshenkilöä"
#: paperless_mail/models.py:82
msgid "order"
msgstr ""
msgstr "järjestys"
#: paperless_mail/models.py:88
msgid "account"
msgstr ""
msgstr "tili"
#: paperless_mail/models.py:92
msgid "folder"
msgstr ""
msgstr "kansio"
#: paperless_mail/models.py:96
msgid "Subfolders must be separated by a delimiter, often a dot ('.') or slash ('/'), but it varies by mail server."
msgstr ""
msgstr "Alikansiot on erotettava erottimella, usein pisteellä ('.') tai kauttaviivalla ('/'), mutta se vaihtelee postipalvelimen mukaan."
#: paperless_mail/models.py:102
msgid "filter from"
msgstr ""
msgstr "suodata lähettäjä-kenttä"
#: paperless_mail/models.py:108
msgid "filter subject"
msgstr ""
msgstr "suodata aihe"
#: paperless_mail/models.py:114
msgid "filter body"
msgstr ""
msgstr "suodata runko"
#: paperless_mail/models.py:121
msgid "filter attachment filename"
msgstr ""
msgstr "suodata liitteen tiedostonimi"
#: paperless_mail/models.py:126
msgid "Only consume documents which entirely match this filename if specified. Wildcards such as *.pdf or *invoice* are allowed. Case insensitive."
msgstr ""
msgstr "Tuo vain dokumentit jotka täsmäävät täysin tiedostonimen suhteen. Jokerimerkit kuten *.pdf tai *lasku* ovat sallittuja. Kirjainkoko ei merkitse."
#: paperless_mail/models.py:133
msgid "maximum age"
msgstr ""
msgstr "ikä enintään"
#: paperless_mail/models.py:135
msgid "Specified in days."
msgstr ""
msgstr "Määritetty päivinä."
#: paperless_mail/models.py:139
msgid "attachment type"
msgstr ""
msgstr "liitteen tyyppi"
#: paperless_mail/models.py:143
msgid "Inline attachments include embedded images, so it's best to combine this option with a filename filter."
msgstr ""
msgstr "Sisäiset liitteet sisältävät upotettuja kuvia, joten on parasta yhdistää tämä vaihtoehto tiedostonimen suodattimeen."
#: paperless_mail/models.py:149
msgid "action"
msgstr ""
msgstr "toiminto"
#: paperless_mail/models.py:155
msgid "action parameter"
msgstr ""
msgstr "toiminnon parametrit"
#: paperless_mail/models.py:160
msgid "Additional parameter for the action selected above, i.e., the target folder of the move to folder action. Subfolders must be separated by dots."
msgstr ""
msgstr "Yllä valitun toiminnon lisäparametri eli siirrä hakemistoon -toiminnon kohdehakemisto. Alikansiot on erotettava toisistaan pisteillä."
#: paperless_mail/models.py:168
msgid "assign title from"
msgstr ""
msgstr "aseta otsikko kohteesta"
#: paperless_mail/models.py:176
msgid "assign this tag"
msgstr ""
msgstr "määritä tämä tunniste"
#: paperless_mail/models.py:184
msgid "assign this document type"
msgstr ""
msgstr "määritä tämä asiakirjatyyppi"
#: paperless_mail/models.py:188
msgid "assign correspondent from"
msgstr ""
msgstr "määritä kirjeenvaihtaja kohteesta"
#: paperless_mail/models.py:198
msgid "assign this correspondent"
msgstr ""
msgstr "määritä tämä kirjeenvaihtaja"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-09-07 21:41\n"
"Last-Translator: \n"
"Language-Team: French\n"
"Language: fr_FR\n"
@@ -100,15 +100,15 @@ msgstr "types de document"
#: documents/models.py:90
msgid "path"
msgstr ""
msgstr "chemin"
#: documents/models.py:96 documents/models.py:124
msgid "storage path"
msgstr ""
msgstr "chemin de stockage"
#: documents/models.py:97
msgid "storage paths"
msgstr ""
msgstr "chemins de stockage"
#: documents/models.py:105
msgid "Unencrypted"
@@ -376,7 +376,7 @@ msgstr "règles de filtrage"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "démarré"
#: documents/serialisers.py:70
#, python-format
@@ -394,7 +394,7 @@ msgstr "Type de fichier %(type)s non pris en charge"
#: documents/serialisers.py:596
msgid "Invalid variable detected."
msgstr ""
msgstr "Variable non valide détectée."
#: documents/templates/index.html:78
msgid "Paperless-ngx is loading..."
@@ -402,11 +402,11 @@ msgstr "Paperless-ngx est en cours de chargement..."
#: documents/templates/index.html:79
msgid "Still here?! Hmm, something might be wrong."
msgstr ""
msgstr "Toujours ici ? Hum, quelque chose a dû mal se passer."
#: documents/templates/index.html:79
msgid "Here's a link to the docs."
msgstr ""
msgstr "Lien vers la documentation."
#: documents/templates/registration/logged_out.html:14
msgid "Paperless-ngx signed out"
@@ -450,7 +450,7 @@ msgstr "Anglais (US)"
#: paperless/settings.py:340
msgid "Belarusian"
msgstr ""
msgstr "Biélorusse"
#: paperless/settings.py:341
msgid "Czech"
@@ -510,11 +510,11 @@ msgstr "Russe"
#: paperless/settings.py:355
msgid "Slovenian"
msgstr ""
msgstr "Slovène"
#: paperless/settings.py:356
msgid "Serbian"
msgstr ""
msgstr "Serbe"
#: paperless/settings.py:357
msgid "Swedish"
@@ -522,11 +522,11 @@ msgstr "Suédois"
#: paperless/settings.py:358
msgid "Turkish"
msgstr ""
msgstr "Turc"
#: paperless/settings.py:359
msgid "Chinese Simplified"
msgstr ""
msgstr "Chinois simplifié"
#: paperless/urls.py:161
msgid "Paperless-ngx administration"
@@ -654,7 +654,7 @@ msgstr "Marquer le courriel, ne pas traiter les courriels marqués"
#: paperless_mail/models.py:68
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
msgstr "Affecter létiquette spécifée au courrier, ne pas traiter les courriels étiquetés"
#: paperless_mail/models.py:71
msgid "Use subject as title"
@@ -694,7 +694,7 @@ msgstr "répertoire"
#: paperless_mail/models.py:96
msgid "Subfolders must be separated by a delimiter, often a dot ('.') or slash ('/'), but it varies by mail server."
msgstr ""
msgstr "Les sous-dossiers doivent être séparés par un délimiteurs, souvent un point ('.') ou un slash ('/'), en fonction du serveur de messagerie."
#: paperless_mail/models.py:102
msgid "filter from"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-08-03 11:24\n"
"Last-Translator: \n"
"Language-Team: Italian\n"
"Language: it_IT\n"
@@ -376,7 +376,7 @@ msgstr "regole filtro"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "avviato"
#: documents/serialisers.py:70
#, python-format
@@ -654,7 +654,7 @@ msgstr "Contrassegna la email, non elaborare le email elaborate."
#: paperless_mail/models.py:68
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
msgstr "Etichetta la posta con il tag specificato, non processare le email etichettate"
#: paperless_mail/models.py:71
msgid "Use subject as title"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-08-26 20:54\n"
"Last-Translator: \n"
"Language-Team: Dutch\n"
"Language: nl_NL\n"
@@ -100,15 +100,15 @@ msgstr "documenttypen"
#: documents/models.py:90
msgid "path"
msgstr ""
msgstr "pad"
#: documents/models.py:96 documents/models.py:124
msgid "storage path"
msgstr ""
msgstr "opslag pad"
#: documents/models.py:97
msgid "storage paths"
msgstr ""
msgstr "opslag paden"
#: documents/models.py:105
msgid "Unencrypted"
@@ -376,7 +376,7 @@ msgstr "filterregels"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "gestart"
#: documents/serialisers.py:70
#, python-format
@@ -394,7 +394,7 @@ msgstr "Bestandstype %(type)s niet ondersteund"
#: documents/serialisers.py:596
msgid "Invalid variable detected."
msgstr ""
msgstr "Ongeldige variabele ontdekt."
#: documents/templates/index.html:78
msgid "Paperless-ngx is loading..."
@@ -402,7 +402,7 @@ msgstr "Paperless-ngx is aan het laden..."
#: documents/templates/index.html:79
msgid "Still here?! Hmm, something might be wrong."
msgstr ""
msgstr "Nog steeds hier?! Hmm, er kan iets mis zijn."
#: documents/templates/index.html:79
msgid "Here's a link to the docs."
@@ -450,7 +450,7 @@ msgstr "Engels (US)"
#: paperless/settings.py:340
msgid "Belarusian"
msgstr ""
msgstr "Wit-Russisch"
#: paperless/settings.py:341
msgid "Czech"
@@ -510,11 +510,11 @@ msgstr "Russisch"
#: paperless/settings.py:355
msgid "Slovenian"
msgstr ""
msgstr "Sloveens"
#: paperless/settings.py:356
msgid "Serbian"
msgstr ""
msgstr "Servisch"
#: paperless/settings.py:357
msgid "Swedish"
@@ -522,11 +522,11 @@ msgstr "Zweeds"
#: paperless/settings.py:358
msgid "Turkish"
msgstr ""
msgstr "Turks"
#: paperless/settings.py:359
msgid "Chinese Simplified"
msgstr ""
msgstr "Chinees (vereenvoudigd)"
#: paperless/urls.py:161
msgid "Paperless-ngx administration"
@@ -654,7 +654,7 @@ msgstr "Markeer de mail, verwerk geen mails met markering"
#: paperless_mail/models.py:68
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
msgstr "Tag de mail met de opgegeven tag, verwerk geen getagde mails"
#: paperless_mail/models.py:71
msgid "Use subject as title"
@@ -694,7 +694,7 @@ msgstr "map"
#: paperless_mail/models.py:96
msgid "Subfolders must be separated by a delimiter, often a dot ('.') or slash ('/'), but it varies by mail server."
msgstr ""
msgstr "Submappen moeten gescheiden worden door een scheidingsteken, vaak een punt ('.') of slash ('/'), maar het varieert per mailserver."
#: paperless_mail/models.py:102
msgid "filter from"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-08-03 08:59\n"
"Last-Translator: \n"
"Language-Team: Norwegian\n"
"Language: no_NO\n"
@@ -220,7 +220,7 @@ msgstr "kritisk"
#: documents/models.py:325
msgid "group"
msgstr ""
msgstr "gruppe"
#: documents/models.py:327
msgid "message"
@@ -228,11 +228,11 @@ msgstr "melding"
#: documents/models.py:330
msgid "level"
msgstr ""
msgstr "nivå"
#: documents/models.py:339
msgid "log"
msgstr "log"
msgstr "Logg"
#: documents/models.py:340
msgid "logs"
@@ -240,11 +240,11 @@ msgstr "logger"
#: documents/models.py:350 documents/models.py:403
msgid "saved view"
msgstr ""
msgstr "lagret visning"
#: documents/models.py:351
msgid "saved views"
msgstr ""
msgstr "lagrede visninger"
#: documents/models.py:353
msgid "user"
@@ -252,35 +252,35 @@ msgstr "bruker"
#: documents/models.py:357
msgid "show on dashboard"
msgstr ""
msgstr "vis på dashbordet"
#: documents/models.py:360
msgid "show in sidebar"
msgstr ""
msgstr "vis i sidestolpen"
#: documents/models.py:364
msgid "sort field"
msgstr ""
msgstr "sorter felt"
#: documents/models.py:369
msgid "sort reverse"
msgstr ""
msgstr "sorter på baksiden"
#: documents/models.py:374
msgid "title contains"
msgstr ""
msgstr "tittelen inneholder"
#: documents/models.py:375
msgid "content contains"
msgstr ""
msgstr "innholdet inneholder"
#: documents/models.py:376
msgid "ASN is"
msgstr ""
msgstr "ASN er"
#: documents/models.py:377
msgid "correspondent is"
msgstr ""
msgstr "tilsvarendet er"
#: documents/models.py:378
msgid "document type is"
@@ -288,15 +288,15 @@ msgstr "dokumenttype er"
#: documents/models.py:379
msgid "is in inbox"
msgstr ""
msgstr "er i innboksen"
#: documents/models.py:380
msgid "has tag"
msgstr ""
msgstr "har tagg"
#: documents/models.py:381
msgid "has any tag"
msgstr ""
msgstr "har en tag"
#: documents/models.py:382
msgid "created before"
@@ -304,125 +304,125 @@ msgstr "opprettet før"
#: documents/models.py:383
msgid "created after"
msgstr ""
msgstr "opprettet etter"
#: documents/models.py:384
msgid "created year is"
msgstr ""
msgstr "opprettet år er"
#: documents/models.py:385
msgid "created month is"
msgstr ""
msgstr "opprettet måned er"
#: documents/models.py:386
msgid "created day is"
msgstr ""
msgstr "opprettet dag er"
#: documents/models.py:387
msgid "added before"
msgstr ""
msgstr "lagt til før"
#: documents/models.py:388
msgid "added after"
msgstr ""
msgstr "lagt til etter"
#: documents/models.py:389
msgid "modified before"
msgstr ""
msgstr "endret før"
#: documents/models.py:390
msgid "modified after"
msgstr ""
msgstr "endret etter"
#: documents/models.py:391
msgid "does not have tag"
msgstr ""
msgstr "har ikke tagg"
#: documents/models.py:392
msgid "does not have ASN"
msgstr ""
msgstr "har ikke ASN"
#: documents/models.py:393
msgid "title or content contains"
msgstr ""
msgstr "tittel eller innhold inneholder"
#: documents/models.py:394
msgid "fulltext query"
msgstr ""
msgstr "full tekst spørring"
#: documents/models.py:395
msgid "more like this"
msgstr ""
msgstr "mer som dette"
#: documents/models.py:396
msgid "has tags in"
msgstr ""
msgstr "har tags i"
#: documents/models.py:406
msgid "rule type"
msgstr ""
msgstr "Type regel"
#: documents/models.py:408
msgid "value"
msgstr ""
msgstr "verdi"
#: documents/models.py:411
msgid "filter rule"
msgstr ""
msgstr "filtrer regel"
#: documents/models.py:412
msgid "filter rules"
msgstr ""
msgstr "filtrer regler"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "startet"
#: documents/serialisers.py:70
#, python-format
msgid "Invalid regular expression: %(error)s"
msgstr ""
msgstr "Ugyldig regulært uttrykk: %(error)s"
#: documents/serialisers.py:191
msgid "Invalid color."
msgstr ""
msgstr "Ugyldig farge."
#: documents/serialisers.py:515
#, python-format
msgid "File type %(type)s not supported"
msgstr ""
msgstr "Filtype %(type)s støttes ikke"
#: documents/serialisers.py:596
msgid "Invalid variable detected."
msgstr ""
msgstr "Ugyldig variabel oppdaget."
#: documents/templates/index.html:78
msgid "Paperless-ngx is loading..."
msgstr ""
msgstr "Paperless-ngx laster..."
#: documents/templates/index.html:79
msgid "Still here?! Hmm, something might be wrong."
msgstr ""
msgstr "Fortsatt her?! Hmm, noe kan være galt."
#: documents/templates/index.html:79
msgid "Here's a link to the docs."
msgstr ""
msgstr "Her er en lenke til dokkene."
#: documents/templates/registration/logged_out.html:14
msgid "Paperless-ngx signed out"
msgstr ""
msgstr "Paperless-ngx logget ut"
#: documents/templates/registration/logged_out.html:59
msgid "You have been successfully logged out. Bye!"
msgstr ""
msgstr "Du har blitt logget ut. Av!"
#: documents/templates/registration/logged_out.html:60
msgid "Sign in again"
msgstr ""
msgstr "Logg inn igjen"
#: documents/templates/registration/login.html:15
msgid "Paperless-ngx sign in"
msgstr ""
msgstr "Paperless-ngx-tegn inn"
#: documents/templates/registration/login.html:61
msgid "Please sign in."
@@ -450,63 +450,63 @@ msgstr "Engelsk (US)"
#: paperless/settings.py:340
msgid "Belarusian"
msgstr "Belarusian"
msgstr "Hviterussisk"
#: paperless/settings.py:341
msgid "Czech"
msgstr "Czech"
msgstr "Tsjekkisk"
#: paperless/settings.py:342
msgid "Danish"
msgstr "Danish"
msgstr "Dansk"
#: paperless/settings.py:343
msgid "German"
msgstr "German"
msgstr "Tysk"
#: paperless/settings.py:344
msgid "English (GB)"
msgstr "English (GB)"
msgstr "Engelsk (GB)"
#: paperless/settings.py:345
msgid "Spanish"
msgstr "Spanish"
msgstr "Spansk"
#: paperless/settings.py:346
msgid "French"
msgstr "French"
msgstr "Fransk"
#: paperless/settings.py:347
msgid "Italian"
msgstr "Italian"
msgstr "Italiensk"
#: paperless/settings.py:348
msgid "Luxembourgish"
msgstr "Luxembourgish"
msgstr "Luxembourgsk"
#: paperless/settings.py:349
msgid "Dutch"
msgstr "Dutch"
msgstr "Nederlandsk"
#: paperless/settings.py:350
msgid "Polish"
msgstr "Polish"
msgstr "Polsk"
#: paperless/settings.py:351
msgid "Portuguese (Brazil)"
msgstr "Portuguese (Brazil)"
msgstr "Portugisisk (Brasil)"
#: paperless/settings.py:352
msgid "Portuguese"
msgstr "Portuguese"
msgstr "Portugisisk"
#: paperless/settings.py:353
msgid "Romanian"
msgstr "Romanian"
msgstr "Rumensk"
#: paperless/settings.py:354
msgid "Russian"
msgstr "Russian"
msgstr "Russisk"
#: paperless/settings.py:355
msgid "Slovenian"
@@ -514,19 +514,19 @@ msgstr "Slovenian"
#: paperless/settings.py:356
msgid "Serbian"
msgstr "Serbian"
msgstr "Serbisk"
#: paperless/settings.py:357
msgid "Swedish"
msgstr "Swedish"
msgstr "Svensk"
#: paperless/settings.py:358
msgid "Turkish"
msgstr "Turkish"
msgstr "Tyrkisk"
#: paperless/settings.py:359
msgid "Chinese Simplified"
msgstr "Chinese Simplified"
msgstr "Kinesisk forenklet"
#: paperless/urls.py:161
msgid "Paperless-ngx administration"
@@ -542,7 +542,7 @@ msgstr "Avanserte innstillinger"
#: paperless_mail/admin.py:47
msgid "Filter"
msgstr "Filter"
msgstr "Filtrer"
#: paperless_mail/admin.py:50
msgid "Paperless will only process mails that match ALL of the filters given below."
@@ -554,19 +554,19 @@ msgstr "Handlinger"
#: paperless_mail/admin.py:67
msgid "The action applied to the mail. This action is only performed when documents were consumed from the mail. Mails without attachments will remain entirely untouched."
msgstr ""
msgstr "Handlingen som brukes på e-posten. Denne handlingen blir bare utført når dokumenter blir forbrukt av e-posten. Mailer uten vedlegg forblir helt urørte."
#: paperless_mail/admin.py:75
msgid "Metadata"
msgstr "Metadata"
msgstr "Nøkkeldata"
#: paperless_mail/admin.py:78
msgid "Assign metadata to documents consumed from this rule automatically. If you do not assign tags, types or correspondents here, paperless will still process all matching rules that you have defined."
msgstr ""
msgstr "Tilordne metadata til dokumenter som brukes fra denne regelen automatisk. Hvis du ikke tilordner etiketter, typer eller korrespondenter her, vil papirløs fremdeles behandle alle matchende regler som du har definert."
#: paperless_mail/apps.py:8
msgid "Paperless mail"
msgstr ""
msgstr "Paperløst e-post"
#: paperless_mail/models.py:8
msgid "mail account"
@@ -586,23 +586,23 @@ msgstr "Bruk SSL"
#: paperless_mail/models.py:14
msgid "Use STARTTLS"
msgstr ""
msgstr "Bruk STARTTLS"
#: paperless_mail/models.py:18
msgid "IMAP server"
msgstr ""
msgstr "IMAP tjener"
#: paperless_mail/models.py:21
msgid "IMAP port"
msgstr ""
msgstr "IMAP port"
#: paperless_mail/models.py:25
msgid "This is usually 143 for unencrypted and STARTTLS connections, and 993 for SSL connections."
msgstr ""
msgstr "Dette er vanligvis 143 for ukrypterte og STARTTLS-tilkoblinger, og 993 for SSL-tilkoblinger."
#: paperless_mail/models.py:31
msgid "IMAP security"
msgstr ""
msgstr "IMAP sikkerhet"
#: paperless_mail/models.py:36
msgid "username"
@@ -618,7 +618,7 @@ msgstr "tegnsett"
#: paperless_mail/models.py:45
msgid "The character set to use when communicating with the mail server, such as 'UTF-8' or 'US-ASCII'."
msgstr ""
msgstr "Tegnet som skal brukes ved kommunikasjon med e-posttjeneren, som for eksempel 'UTF-8' eller 'US-ASCII'."
#: paperless_mail/models.py:56
msgid "mail rule"
@@ -626,141 +626,141 @@ msgstr "e-post regel"
#: paperless_mail/models.py:57
msgid "mail rules"
msgstr ""
msgstr "Epost regler"
#: paperless_mail/models.py:60
msgid "Only process attachments."
msgstr ""
msgstr "Bare behandle vedlegg."
#: paperless_mail/models.py:61
msgid "Process all files, including 'inline' attachments."
msgstr ""
msgstr "Behandle alle filer, inkludert \"inline\"-vedlegg."
#: paperless_mail/models.py:64
msgid "Delete"
msgstr ""
msgstr "Slett"
#: paperless_mail/models.py:65
msgid "Move to specified folder"
msgstr ""
msgstr "Flytt til angitt mappe"
#: paperless_mail/models.py:66
msgid "Mark as read, don't process read mails"
msgstr ""
msgstr "Merk som lest og ikke behandle e-post"
#: paperless_mail/models.py:67
msgid "Flag the mail, don't process flagged mails"
msgstr ""
msgstr "Marker posten, ikke behandle flaggede meldinger"
#: paperless_mail/models.py:68
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
msgstr "Merk e-post med angitte tag, ikke bruk merkede meldinger"
#: paperless_mail/models.py:71
msgid "Use subject as title"
msgstr ""
msgstr "Bruk emne som tittel"
#: paperless_mail/models.py:72
msgid "Use attachment filename as title"
msgstr ""
msgstr "Bruk vedlagte filnavn som tittel"
#: paperless_mail/models.py:75
msgid "Do not assign a correspondent"
msgstr ""
msgstr "Ikke tildel en korrespondent"
#: paperless_mail/models.py:76
msgid "Use mail address"
msgstr ""
msgstr "Bruk e-postadresse"
#: paperless_mail/models.py:77
msgid "Use name (or mail address if not available)"
msgstr ""
msgstr "Bruk navn (eller e-postadresse hvis det ikke er tilgjengelig)"
#: paperless_mail/models.py:78
msgid "Use correspondent selected below"
msgstr ""
msgstr "Bruk tilsvarende valgt nedenfor"
#: paperless_mail/models.py:82
msgid "order"
msgstr ""
msgstr "ordre"
#: paperless_mail/models.py:88
msgid "account"
msgstr ""
msgstr "konto"
#: paperless_mail/models.py:92
msgid "folder"
msgstr ""
msgstr "mappe"
#: paperless_mail/models.py:96
msgid "Subfolders must be separated by a delimiter, often a dot ('.') or slash ('/'), but it varies by mail server."
msgstr ""
msgstr "Undermapper må være atskilt av en skilletegn, ofte en punktum ('.') eller skråstrek ('/'), men den varierer fra e-postserver."
#: paperless_mail/models.py:102
msgid "filter from"
msgstr ""
msgstr "filtrer fra"
#: paperless_mail/models.py:108
msgid "filter subject"
msgstr ""
msgstr "filtrer emne"
#: paperless_mail/models.py:114
msgid "filter body"
msgstr ""
msgstr "filtrer innhold"
#: paperless_mail/models.py:121
msgid "filter attachment filename"
msgstr ""
msgstr "filtrer vedlagte filnavn"
#: paperless_mail/models.py:126
msgid "Only consume documents which entirely match this filename if specified. Wildcards such as *.pdf or *invoice* are allowed. Case insensitive."
msgstr ""
msgstr "Bare bruke dokumenter som samsvarer med dette filnavnet hvis angitt. Jokertegn som *.pdf eller *faktura* er tillatt. Saksfortegnet."
#: paperless_mail/models.py:133
msgid "maximum age"
msgstr ""
msgstr "maksimal alder"
#: paperless_mail/models.py:135
msgid "Specified in days."
msgstr ""
msgstr "Spesifisert i dager"
#: paperless_mail/models.py:139
msgid "attachment type"
msgstr ""
msgstr "vedlegg type"
#: paperless_mail/models.py:143
msgid "Inline attachments include embedded images, so it's best to combine this option with a filename filter."
msgstr ""
msgstr "Innebygde vedlegg inkluderer innebygde bilder, så det er best å kombinere dette alternativet med et filter."
#: paperless_mail/models.py:149
msgid "action"
msgstr ""
msgstr "handling"
#: paperless_mail/models.py:155
msgid "action parameter"
msgstr ""
msgstr "parameter for handling"
#: paperless_mail/models.py:160
msgid "Additional parameter for the action selected above, i.e., the target folder of the move to folder action. Subfolders must be separated by dots."
msgstr ""
msgstr "Ytterligere parameter for handlingen valgt ovenfor, dvs. målmappen for flytting til mappehandling. Undermapper må separeres med punkter."
#: paperless_mail/models.py:168
msgid "assign title from"
msgstr ""
msgstr "tilordne tittel fra"
#: paperless_mail/models.py:176
msgid "assign this tag"
msgstr ""
msgstr "tilordne denne taggen"
#: paperless_mail/models.py:184
msgid "assign this document type"
msgstr ""
msgstr "tilordne denne dokumenttypen"
#: paperless_mail/models.py:188
msgid "assign correspondent from"
msgstr ""
msgstr "Tildel korrespondent fra"
#: paperless_mail/models.py:198
msgid "assign this correspondent"
msgstr ""
msgstr "Tildel denne korrespondenten"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-08-17 11:20\n"
"Last-Translator: \n"
"Language-Team: Polish\n"
"Language: pl_PL\n"
@@ -376,7 +376,7 @@ msgstr "reguły filtrowania"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "start"
#: documents/serialisers.py:70
#, python-format
@@ -654,7 +654,7 @@ msgstr "Oznacz wiadomość, nie przetwarzaj oznaczonych wiadomości"
#: paperless_mail/models.py:68
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
msgstr "Oznacz pocztę z podanym tagiem, nie przetwarzaj otagowanych wiadomości"
#: paperless_mail/models.py:71
msgid "Use subject as title"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-08-03 16:12\n"
"Last-Translator: \n"
"Language-Team: Russian\n"
"Language: ru_RU\n"
@@ -100,15 +100,15 @@ msgstr "типы документов"
#: documents/models.py:90
msgid "path"
msgstr ""
msgstr "путь"
#: documents/models.py:96 documents/models.py:124
msgid "storage path"
msgstr ""
msgstr "путь к хранилищу"
#: documents/models.py:97
msgid "storage paths"
msgstr ""
msgstr "пути хранения"
#: documents/models.py:105
msgid "Unencrypted"
@@ -376,7 +376,7 @@ msgstr "правила фильтрации"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "запущено"
#: documents/serialisers.py:70
#, python-format
@@ -394,7 +394,7 @@ msgstr "Тип файла %(type)s не поддерживается"
#: documents/serialisers.py:596
msgid "Invalid variable detected."
msgstr ""
msgstr "Обнаружена неверная переменная."
#: documents/templates/index.html:78
msgid "Paperless-ngx is loading..."
@@ -402,11 +402,11 @@ msgstr "Paperless-ngx загружается..."
#: documents/templates/index.html:79
msgid "Still here?! Hmm, something might be wrong."
msgstr ""
msgstr "Все еще здесь?! Хмм, возможно что-то не так."
#: documents/templates/index.html:79
msgid "Here's a link to the docs."
msgstr ""
msgstr "Вот ссылка на документацию."
#: documents/templates/registration/logged_out.html:14
msgid "Paperless-ngx signed out"
@@ -450,7 +450,7 @@ msgstr "Английский (США)"
#: paperless/settings.py:340
msgid "Belarusian"
msgstr ""
msgstr "Белорусский"
#: paperless/settings.py:341
msgid "Czech"
@@ -510,11 +510,11 @@ msgstr "Русский"
#: paperless/settings.py:355
msgid "Slovenian"
msgstr ""
msgstr "Словенский"
#: paperless/settings.py:356
msgid "Serbian"
msgstr ""
msgstr "Сербский"
#: paperless/settings.py:357
msgid "Swedish"
@@ -522,11 +522,11 @@ msgstr "Шведский"
#: paperless/settings.py:358
msgid "Turkish"
msgstr ""
msgstr "Турецкий"
#: paperless/settings.py:359
msgid "Chinese Simplified"
msgstr ""
msgstr "Китайский упрощенный"
#: paperless/urls.py:161
msgid "Paperless-ngx administration"
@@ -654,7 +654,7 @@ msgstr "Пометить почту, не обрабатывать помече
#: paperless_mail/models.py:68
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
msgstr "Отметить почту указанным тегом, не обрабатывать помеченные письма"
#: paperless_mail/models.py:71
msgid "Use subject as title"
@@ -694,7 +694,7 @@ msgstr "каталог"
#: paperless_mail/models.py:96
msgid "Subfolders must be separated by a delimiter, often a dot ('.') or slash ('/'), but it varies by mail server."
msgstr ""
msgstr "Подпапки должны быть отделены разделителем, часто точкой ('.') или косой чертой ('/'), но это зависит от почтового сервера."
#: paperless_mail/models.py:102
msgid "filter from"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-08-25 12:46\n"
"Last-Translator: \n"
"Language-Team: Slovenian\n"
"Language: sl_SI\n"
@@ -100,15 +100,15 @@ msgstr "vrste dokumentov"
#: documents/models.py:90
msgid "path"
msgstr ""
msgstr "pot"
#: documents/models.py:96 documents/models.py:124
msgid "storage path"
msgstr ""
msgstr "pot do shrambe"
#: documents/models.py:97
msgid "storage paths"
msgstr ""
msgstr "poti do shrambe"
#: documents/models.py:105
msgid "Unencrypted"
@@ -376,7 +376,7 @@ msgstr "filtriraj pravila"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "zagnano"
#: documents/serialisers.py:70
#, python-format
@@ -394,7 +394,7 @@ msgstr "Vrsta datoteke %(type)s ni podprta"
#: documents/serialisers.py:596
msgid "Invalid variable detected."
msgstr ""
msgstr "Zaznani neveljavni znaki."
#: documents/templates/index.html:78
msgid "Paperless-ngx is loading..."
@@ -402,11 +402,11 @@ msgstr "Paperless-ngx se nalaga..."
#: documents/templates/index.html:79
msgid "Still here?! Hmm, something might be wrong."
msgstr ""
msgstr "Še vedno tam? Hmm, kot kaže je šlo nekaj narobe."
#: documents/templates/index.html:79
msgid "Here's a link to the docs."
msgstr ""
msgstr "Tu je povezava do dokumentacije."
#: documents/templates/registration/logged_out.html:14
msgid "Paperless-ngx signed out"
@@ -450,7 +450,7 @@ msgstr "Angleščina (ZDA)"
#: paperless/settings.py:340
msgid "Belarusian"
msgstr ""
msgstr "Beloruščina"
#: paperless/settings.py:341
msgid "Czech"
@@ -510,11 +510,11 @@ msgstr "Ruščina"
#: paperless/settings.py:355
msgid "Slovenian"
msgstr ""
msgstr "Slovenščina"
#: paperless/settings.py:356
msgid "Serbian"
msgstr ""
msgstr "Srbščina"
#: paperless/settings.py:357
msgid "Swedish"
@@ -522,11 +522,11 @@ msgstr "Švedščina"
#: paperless/settings.py:358
msgid "Turkish"
msgstr ""
msgstr "Turščina"
#: paperless/settings.py:359
msgid "Chinese Simplified"
msgstr ""
msgstr "Poenostavljena kitajščina"
#: paperless/urls.py:161
msgid "Paperless-ngx administration"
@@ -654,7 +654,7 @@ msgstr "Označite pošto z zastavico, ne obdelujte označene pošte"
#: paperless_mail/models.py:68
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
msgstr "Označi pošto s določeno oznako, ne procesiraj označene pošte"
#: paperless_mail/models.py:71
msgid "Use subject as title"
@@ -694,7 +694,7 @@ msgstr "mapa"
#: paperless_mail/models.py:96
msgid "Subfolders must be separated by a delimiter, often a dot ('.') or slash ('/'), but it varies by mail server."
msgstr ""
msgstr "Podmape morajo biti ločene s znakom, običajno je to pika (.) ali slash ('/'), je pa odvisno od poštnega strežnika."
#: paperless_mail/models.py:102
msgid "filter from"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-08-04 23:55\n"
"Last-Translator: \n"
"Language-Team: Serbian (Latin)\n"
"Language: sr_CS\n"
@@ -60,15 +60,15 @@ msgstr "algoritam podudaranja"
#: documents/models.py:47
msgid "is insensitive"
msgstr ""
msgstr "bez razlike veliko/malo slovo"
#: documents/models.py:60 documents/models.py:115
msgid "correspondent"
msgstr "dopisnik"
msgstr "korespodent"
#: documents/models.py:61
msgid "correspondents"
msgstr "dopisnici"
msgstr "korespodenti"
#: documents/models.py:66
msgid "color"
@@ -80,7 +80,7 @@ msgstr "je oznaka prijemnog sandučeta"
#: documents/models.py:72
msgid "Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags."
msgstr ""
msgstr "Označava ovu oznaku kao oznaku prijemnog sandučeta (inbox): Svi novoobrađeni dokumenti će biti označeni oznakama prijemnog sandučeta (inbox)."
#: documents/models.py:78
msgid "tag"
@@ -100,23 +100,23 @@ msgstr "tipovi dokumenta"
#: documents/models.py:90
msgid "path"
msgstr ""
msgstr "putanja"
#: documents/models.py:96 documents/models.py:124
msgid "storage path"
msgstr ""
msgstr "putanja skladišta"
#: documents/models.py:97
msgid "storage paths"
msgstr ""
msgstr "putanja skladišta"
#: documents/models.py:105
msgid "Unencrypted"
msgstr ""
msgstr "Nešifrovano"
#: documents/models.py:106
msgid "Encrypted with GNU Privacy Guard"
msgstr ""
msgstr "Šifrovano pomoću GNU Privacy Guard"
#: documents/models.py:127
msgid "title"
@@ -128,7 +128,7 @@ msgstr "sadržaj"
#: documents/models.py:142
msgid "The raw, text-only data of the document. This field is primarily used for searching."
msgstr ""
msgstr "Neobrađeni tekstualni podaci dokumenta. Ovo se polje koristi prvenstveno za pretraživanje."
#: documents/models.py:147
msgid "mime type"
@@ -172,7 +172,7 @@ msgstr "naziv fajla"
#: documents/models.py:204
msgid "Current filename in storage"
msgstr ""
msgstr "Trenutni naziv sačuvane datoteke"
#: documents/models.py:208
msgid "archive filename"
@@ -180,7 +180,7 @@ msgstr "naziv fajla arhive"
#: documents/models.py:214
msgid "Current archive filename in storage"
msgstr ""
msgstr "Trenutni naziv arhivirane sačuvane datoteke"
#: documents/models.py:218
msgid "archive serial number"
@@ -188,7 +188,7 @@ msgstr "arhivski serijski broj"
#: documents/models.py:224
msgid "The position of this document in your physical document archive."
msgstr ""
msgstr "Položaj ovog dokumenta u vašoj fizičkoj arhivi dokumenata."
#: documents/models.py:230
msgid "document"
@@ -264,7 +264,7 @@ msgstr "polje za sortiranje"
#: documents/models.py:369
msgid "sort reverse"
msgstr ""
msgstr "obrnuto sortiranje"
#: documents/models.py:374
msgid "title contains"
@@ -280,7 +280,7 @@ msgstr "ASN je"
#: documents/models.py:377
msgid "correspondent is"
msgstr "dopisnik je"
msgstr "korespodent je"
#: documents/models.py:378
msgid "document type is"
@@ -348,7 +348,7 @@ msgstr "naslov i sadržaj sadrži"
#: documents/models.py:394
msgid "fulltext query"
msgstr ""
msgstr "upit za ceo tekst"
#: documents/models.py:395
msgid "more like this"
@@ -376,12 +376,12 @@ msgstr "filter pravila"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "pokrenuto"
#: documents/serialisers.py:70
#, python-format
msgid "Invalid regular expression: %(error)s"
msgstr ""
msgstr "Nevažeći regularni izraz: %(error)s"
#: documents/serialisers.py:191
msgid "Invalid color."
@@ -390,11 +390,11 @@ msgstr "Nevažeća boja."
#: documents/serialisers.py:515
#, python-format
msgid "File type %(type)s not supported"
msgstr ""
msgstr "Vrsta datoteke %(type)s nije podržana"
#: documents/serialisers.py:596
msgid "Invalid variable detected."
msgstr ""
msgstr "Otkrivena je nevažeća promenljiva."
#: documents/templates/index.html:78
msgid "Paperless-ngx is loading..."
@@ -402,19 +402,19 @@ msgstr "Paperless-ngx se učitava..."
#: documents/templates/index.html:79
msgid "Still here?! Hmm, something might be wrong."
msgstr ""
msgstr "Još uvek si ovde?! Hmm, možda nešto nije u redu."
#: documents/templates/index.html:79
msgid "Here's a link to the docs."
msgstr ""
msgstr "Veze ka dokumentima."
#: documents/templates/registration/logged_out.html:14
msgid "Paperless-ngx signed out"
msgstr ""
msgstr "Paperless-ngx odjavljen"
#: documents/templates/registration/logged_out.html:59
msgid "You have been successfully logged out. Bye!"
msgstr ""
msgstr "Uspešno ste se odjavili!"
#: documents/templates/registration/logged_out.html:60
msgid "Sign in again"
@@ -422,7 +422,7 @@ msgstr "Prijavitе sе ponovo"
#: documents/templates/registration/login.html:15
msgid "Paperless-ngx sign in"
msgstr ""
msgstr "Paperless-ngx prijava"
#: documents/templates/registration/login.html:61
msgid "Please sign in."
@@ -430,7 +430,7 @@ msgstr "Prijavite se."
#: documents/templates/registration/login.html:64
msgid "Your username and password didn't match. Please try again."
msgstr ""
msgstr "Vaše korisničko ime i lozinka ne odgovaraju. Molimo pokušajte ponovo."
#: documents/templates/registration/login.html:67
msgid "Username"
@@ -450,7 +450,7 @@ msgstr "Engleski (US)"
#: paperless/settings.py:340
msgid "Belarusian"
msgstr ""
msgstr "Beloruski"
#: paperless/settings.py:341
msgid "Czech"
@@ -510,11 +510,11 @@ msgstr "Ruski"
#: paperless/settings.py:355
msgid "Slovenian"
msgstr ""
msgstr "Slovenački"
#: paperless/settings.py:356
msgid "Serbian"
msgstr ""
msgstr "Srpski"
#: paperless/settings.py:357
msgid "Swedish"
@@ -522,11 +522,11 @@ msgstr "Švedski"
#: paperless/settings.py:358
msgid "Turkish"
msgstr ""
msgstr "Turski"
#: paperless/settings.py:359
msgid "Chinese Simplified"
msgstr ""
msgstr "Kineski pojednostavljen"
#: paperless/urls.py:161
msgid "Paperless-ngx administration"
@@ -534,7 +534,7 @@ msgstr "Paperless-ngx administracija"
#: paperless_mail/admin.py:29
msgid "Authentication"
msgstr ""
msgstr "Autentifikacija"
#: paperless_mail/admin.py:30
msgid "Advanced settings"
@@ -546,7 +546,7 @@ msgstr "Filter"
#: paperless_mail/admin.py:50
msgid "Paperless will only process mails that match ALL of the filters given below."
msgstr ""
msgstr "Paperless-ngx će obrađivati samo e-poštu koja odgovara SVIM filterima navedenim u nastavku."
#: paperless_mail/admin.py:64
msgid "Actions"
@@ -554,7 +554,7 @@ msgstr "Radnje"
#: paperless_mail/admin.py:67
msgid "The action applied to the mail. This action is only performed when documents were consumed from the mail. Mails without attachments will remain entirely untouched."
msgstr ""
msgstr "Akcija se odnosi na e-poštu. Ova se radnja izvodi samo ako su dokumenti konzumirani iz e-pošte. E-pošta bez priloga ostat će u potpunosti netaknuta."
#: paperless_mail/admin.py:75
msgid "Metadata"
@@ -562,7 +562,7 @@ msgstr "Metapodaci"
#: paperless_mail/admin.py:78
msgid "Assign metadata to documents consumed from this rule automatically. If you do not assign tags, types or correspondents here, paperless will still process all matching rules that you have defined."
msgstr ""
msgstr "Automatski dodelite metapodatke dokumentima koji se koriste iz ovog pravila. Ako ne dodelite oznaku, vrstu ili korespodenta, Paperless-ngx će i dalje obraditi sva pravila podudaranja koja ste definisali."
#: paperless_mail/apps.py:8
msgid "Paperless mail"
@@ -578,7 +578,7 @@ msgstr "mejl nalozi"
#: paperless_mail/models.py:12
msgid "No encryption"
msgstr ""
msgstr "Nema enkripcije"
#: paperless_mail/models.py:13
msgid "Use SSL"
@@ -598,7 +598,7 @@ msgstr "IMAP port"
#: paperless_mail/models.py:25
msgid "This is usually 143 for unencrypted and STARTTLS connections, and 993 for SSL connections."
msgstr ""
msgstr "Uobičajno 143 za nešifrovane i STARTTLS veze, a 993 za SSL veze."
#: paperless_mail/models.py:31
msgid "IMAP security"
@@ -618,23 +618,23 @@ msgstr "karakter set"
#: paperless_mail/models.py:45
msgid "The character set to use when communicating with the mail server, such as 'UTF-8' or 'US-ASCII'."
msgstr ""
msgstr "Skup znakova koji se koristi pri komunikaciji sa mejl serverom, poput 'UTF-8' ili 'US-ASCII'."
#: paperless_mail/models.py:56
msgid "mail rule"
msgstr ""
msgstr "pravilo e-pošte"
#: paperless_mail/models.py:57
msgid "mail rules"
msgstr ""
msgstr "pravila e-pošte"
#: paperless_mail/models.py:60
msgid "Only process attachments."
msgstr ""
msgstr "Obradi samo priloge."
#: paperless_mail/models.py:61
msgid "Process all files, including 'inline' attachments."
msgstr ""
msgstr "Obradite sve datoteke, uključujući \"umetnute\" priloge."
#: paperless_mail/models.py:64
msgid "Delete"
@@ -642,31 +642,31 @@ msgstr "Obriši"
#: paperless_mail/models.py:65
msgid "Move to specified folder"
msgstr ""
msgstr "Premesti u određen folder"
#: paperless_mail/models.py:66
msgid "Mark as read, don't process read mails"
msgstr ""
msgstr "Označi kao pročitano. Ne obrađuj pročitanu e-poštu"
#: paperless_mail/models.py:67
msgid "Flag the mail, don't process flagged mails"
msgstr ""
msgstr "Označi poštu zastavicom. Ne obrađuj e-poštu sa zastavicom"
#: paperless_mail/models.py:68
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
msgstr "Označite poštu specifičnom oznakom. Ne obrađuj e-poštu s specifičnom oznakom"
#: paperless_mail/models.py:71
msgid "Use subject as title"
msgstr ""
msgstr "Koristi predmet kao naziv"
#: paperless_mail/models.py:72
msgid "Use attachment filename as title"
msgstr ""
msgstr "Koristi naziv datoteke priloga kao naziv"
#: paperless_mail/models.py:75
msgid "Do not assign a correspondent"
msgstr "Ne dodeljuj dopisnika"
msgstr "Ne dodeljuj korespodenta"
#: paperless_mail/models.py:76
msgid "Use mail address"
@@ -678,7 +678,7 @@ msgstr "Koristi naziv (ili mejl adresu ako nije dostupno)"
#: paperless_mail/models.py:78
msgid "Use correspondent selected below"
msgstr "Koristi dopisnika ispod"
msgstr "Koristi koreespodenta ispod"
#: paperless_mail/models.py:82
msgid "order"
@@ -694,7 +694,7 @@ msgstr "folder"
#: paperless_mail/models.py:96
msgid "Subfolders must be separated by a delimiter, often a dot ('.') or slash ('/'), but it varies by mail server."
msgstr ""
msgstr "Podfolderi moraju biti odvojeni separatorom, često tačkom ('.') ili kosom crtom ('/'), ali to se razlikuje zavisno od servera e-pošte."
#: paperless_mail/models.py:102
msgid "filter from"
@@ -714,15 +714,15 @@ msgstr "filter naziv fajla priloga"
#: paperless_mail/models.py:126
msgid "Only consume documents which entirely match this filename if specified. Wildcards such as *.pdf or *invoice* are allowed. Case insensitive."
msgstr ""
msgstr "Konzumirajte samo dokumente koji u potpunosti odgovaraju ovom nazivu datoteke ako je navedeno. Dopušteni su zamenski znakovi kao što su *.pdf ili *faktura*. Neosetljivo je na mala i mala slova."
#: paperless_mail/models.py:133
msgid "maximum age"
msgstr ""
msgstr "maksimalna starost"
#: paperless_mail/models.py:135
msgid "Specified in days."
msgstr ""
msgstr "Navedeno u danima."
#: paperless_mail/models.py:139
msgid "attachment type"
@@ -730,7 +730,7 @@ msgstr "tip priloga"
#: paperless_mail/models.py:143
msgid "Inline attachments include embedded images, so it's best to combine this option with a filename filter."
msgstr ""
msgstr "Ugrađeni prilozi uključuju ugrađene slike, pa je najbolje kombinovati ovu opciju s filterom naziva datoteke."
#: paperless_mail/models.py:149
msgid "action"
@@ -738,11 +738,11 @@ msgstr "radnja"
#: paperless_mail/models.py:155
msgid "action parameter"
msgstr ""
msgstr "parametar akcije"
#: paperless_mail/models.py:160
msgid "Additional parameter for the action selected above, i.e., the target folder of the move to folder action. Subfolders must be separated by dots."
msgstr ""
msgstr "Dodatni parametar za gore odabranu akciju, tj. ciljani folder za premeštanje u folder akcije. Podfolderi moraju biti odvojeni tačkama."
#: paperless_mail/models.py:168
msgid "assign title from"
@@ -758,9 +758,9 @@ msgstr "dodeli ovaj tip dokumenta"
#: paperless_mail/models.py:188
msgid "assign correspondent from"
msgstr "dodeli dopisnika iz"
msgstr "dodeli korespodenta iz"
#: paperless_mail/models.py:198
msgid "assign this correspondent"
msgstr "dodeli ovog dopisnika"
msgstr "dodeli ovog korspodenta"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-08-01 19:02\n"
"Last-Translator: \n"
"Language-Team: Turkish\n"
"Language: tr_TR\n"
@@ -80,7 +80,7 @@ msgstr "gelen kutu etiketidir"
#: documents/models.py:72
msgid "Marks this tag as an inbox tag: All newly consumed documents will be tagged with inbox tags."
msgstr "Bu etiketi, gelen kutusu etiketi olarak işaretle: Tüm yeni olarak tüketilen dökümanlar gelen kutusu etiketi ile etiketlendirileceklerdir."
msgstr "Bu etiketi, gelen kutusu etiketi olarak işaretle: Yeni aktarılan tüm dokümanlar gelen kutusu etiketi ile etiketlendirileceklerdir."
#: documents/models.py:78
msgid "tag"
@@ -376,7 +376,7 @@ msgstr "filtreleme kuralları"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "başladı"
#: documents/serialisers.py:70
#, python-format
@@ -394,7 +394,7 @@ msgstr "Dosya türü %(type)s desteklenmiyor"
#: documents/serialisers.py:596
msgid "Invalid variable detected."
msgstr ""
msgstr "Geçersiz değişken algılandı."
#: documents/templates/index.html:78
msgid "Paperless-ngx is loading..."
@@ -402,7 +402,7 @@ msgstr "Paperless-ngx yükleniyor..."
#: documents/templates/index.html:79
msgid "Still here?! Hmm, something might be wrong."
msgstr ""
msgstr "Hâlâ burada mısınız? Hmm, bir şeyler yanlış olabilir."
#: documents/templates/index.html:79
msgid "Here's a link to the docs."
@@ -450,7 +450,7 @@ msgstr "İngilizce (Birleşik Devletler)"
#: paperless/settings.py:340
msgid "Belarusian"
msgstr ""
msgstr "Belarusça"
#: paperless/settings.py:341
msgid "Czech"
@@ -510,11 +510,11 @@ msgstr "Rusça"
#: paperless/settings.py:355
msgid "Slovenian"
msgstr ""
msgstr "Slovakça"
#: paperless/settings.py:356
msgid "Serbian"
msgstr ""
msgstr "Sırpça"
#: paperless/settings.py:357
msgid "Swedish"
@@ -522,11 +522,11 @@ msgstr "İsveççe"
#: paperless/settings.py:358
msgid "Turkish"
msgstr ""
msgstr "Türkçe"
#: paperless/settings.py:359
msgid "Chinese Simplified"
msgstr ""
msgstr "Basitleştirilmiş Çince"
#: paperless/urls.py:161
msgid "Paperless-ngx administration"

View File

@@ -3,7 +3,7 @@ msgstr ""
"Project-Id-Version: paperless-ngx\n"
"Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2022-07-08 14:11-0700\n"
"PO-Revision-Date: 2022-07-08 22:07\n"
"PO-Revision-Date: 2022-07-15 04:02\n"
"Last-Translator: \n"
"Language-Team: Chinese Simplified\n"
"Language: zh_CN\n"
@@ -376,7 +376,7 @@ msgstr "过滤规则"
#: documents/models.py:521
msgid "started"
msgstr ""
msgstr "已开始"
#: documents/serialisers.py:70
#, python-format
@@ -654,7 +654,7 @@ msgstr "标记邮件,不处理已标记的邮件"
#: paperless_mail/models.py:68
msgid "Tag the mail with specified tag, don't process tagged mails"
msgstr ""
msgstr "用指定标签标记邮件,不要处理已标记的邮件"
#: paperless_mail/models.py:71
msgid "Use subject as title"

View File

@@ -1,4 +1,11 @@
from .celery import app as celery_app
from .checks import binaries_check
from .checks import paths_check
from .checks import settings_values_check
__all__ = ["binaries_check", "paths_check"]
__all__ = [
"celery_app",
"binaries_check",
"paths_check",
"settings_values_check",
]

17
src/paperless/celery.py Normal file
View File

@@ -0,0 +1,17 @@
import os
from celery import Celery
# Set the default Django settings module for the 'celery' program.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "paperless.settings")
app = Celery("paperless")
# Using a string here means the worker doesn't have to serialize
# the configuration object to child processes.
# - namespace='CELERY' means all celery-related configuration keys
# should have a `CELERY_` prefix.
app.config_from_object("django.conf:settings", namespace="CELERY")
# Load task modules from all registered Django apps.
app.autodiscover_tasks()
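With autodiscovery in place, any installed app can expose tasks in a tasks.py module. A hypothetical example of what such a module and its invocation could look like:

from celery import shared_task

@shared_task
def sanity_check():
    # shared_task binds to whichever app is current ("paperless" here),
    # so reusable apps don't need to import the app instance directly.
    return "OK"

# Producers enqueue with sanity_check.delay(); a worker started with
#   celery --app paperless worker
# consumes it from the broker configured via the CELERY_* settings.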

View File

@@ -1,4 +1,6 @@
import grp
import os
import pwd
import shutil
import stat
@@ -32,12 +34,15 @@ def path_check(var, directory):
with open(test_file, "w"):
pass
except PermissionError:
dir_stat = os.stat(directory)
dir_mode = stat.filemode(dir_stat.st_mode)
dir_owner = pwd.getpwuid(dir_stat.st_uid).pw_name
dir_group = grp.getgrgid(dir_stat.st_gid).gr_name
messages.append(
Error(
writeable_message.format(var),
writeable_hint.format(
f"\n{stat.filemode(os.stat(directory).st_mode)} "
f"{directory}\n",
f"\n{dir_mode} {dir_owner} {dir_group} " f"{directory}\n",
),
),
)
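The richer hint now shows mode, owner, and group of the offending directory, e.g. "drwxr-xr-x paperless paperless /data". The three lookups in isolation (Unix-only, like the check itself):

import grp
import os
import pwd
import stat

st = os.stat("/tmp")
print(stat.filemode(st.st_mode))        # e.g. 'drwxrwxrwt'
print(pwd.getpwuid(st.st_uid).pw_name)  # e.g. 'root'
print(grp.getgrgid(st.st_gid).gr_name)  # e.g. 'root' (or 'wheel' on macOS)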
@@ -96,3 +101,52 @@ def debug_mode_check(app_configs, **kwargs):
]
else:
return []
@register()
def settings_values_check(app_configs, **kwargs):
"""
Validates at least some of the user provided settings
"""
def _ocrmypdf_settings_check():
"""
Validates some of the arguments which will be provided to ocrmypdf
against the valid options. Use "ocrmypdf --help" to see the valid
inputs
"""
msgs = []
if settings.OCR_OUTPUT_TYPE not in {
"pdfa",
"pdf",
"pdfa-1",
"pdfa-2",
"pdfa-3",
}:
msgs.append(
Error(f'OCR output type "{settings.OCR_OUTPUT_TYPE}" is not valid'),
)
if settings.OCR_MODE not in {"force", "skip", "redo", "skip_noarchive"}:
msgs.append(Error(f'OCR output mode "{settings.OCR_MODE}" is not valid'))
if settings.OCR_CLEAN not in {"clean", "clean-final", "none"}:
msgs.append(Error(f'OCR clean mode "{settings.OCR_CLEAN}" is not valid'))
return msgs
def _timezone_validate():
"""
Validates the user provided timezone is a valid timezone
"""
try:
import zoneinfo
except ImportError: # pragma: nocover
import backports.zoneinfo as zoneinfo
msgs = []
if settings.TIME_ZONE not in zoneinfo.available_timezones():
msgs.append(
Error(f'Timezone "{settings.TIME_ZONE}" is not a valid timezone'),
)
return msgs
return _ocrmypdf_settings_check() + _timezone_validate()
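Because it is registered via @register(), this runs with the rest of Django's system checks (python manage.py check). The timezone half is plain set membership:

try:
    import zoneinfo
except ImportError:  # Python < 3.9
    import backports.zoneinfo as zoneinfo

print("Europe/Berlin" in zoneinfo.available_timezones())      # True
print("Mars/Olympus_Mons" in zoneinfo.available_timezones())  # False -> the check emits an Error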

View File

@@ -4,11 +4,13 @@ import math
import multiprocessing
import os
import re
import tempfile
from typing import Final
from typing import Optional
from typing import Set
from urllib.parse import urlparse
from celery.schedules import crontab
from concurrent_log_handler.queue import setup_logging_queues
from django.utils.translation import gettext_lazy as _
from dotenv import load_dotenv
@@ -56,6 +58,13 @@ def __get_float(key: str, default: float) -> float:
return float(os.getenv(key, default))
def __get_path(key: str, default: str) -> str:
"""
Return a normalized, absolute path based on the environment variable or a default
"""
return os.path.abspath(os.path.normpath(os.environ.get(key, default)))
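A quick illustration of what the helper normalizes, runnable standalone (the module-level function is copied here for the demo, minus the name-mangling prefix):

import os

def get_path(key: str, default: str) -> str:
    return os.path.abspath(os.path.normpath(os.environ.get(key, default)))

os.environ["PAPERLESS_DATA_DIR"] = "./data/../data/"
print(get_path("PAPERLESS_DATA_DIR", "/opt/data"))
# -> '<cwd>/data': redundant segments, '..' and the trailing slash are
#    collapsed, and the result is always absolute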
# NEVER RUN WITH DEBUG IN PRODUCTION.
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
@@ -66,14 +75,16 @@ DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
STATIC_ROOT = os.getenv("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "static"))
STATIC_ROOT = __get_path("PAPERLESS_STATICDIR", os.path.join(BASE_DIR, "..", "static"))
MEDIA_ROOT = os.getenv("PAPERLESS_MEDIA_ROOT", os.path.join(BASE_DIR, "..", "media"))
MEDIA_ROOT = __get_path("PAPERLESS_MEDIA_ROOT", os.path.join(BASE_DIR, "..", "media"))
ORIGINALS_DIR = os.path.join(MEDIA_ROOT, "documents", "originals")
ARCHIVE_DIR = os.path.join(MEDIA_ROOT, "documents", "archive")
THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")
DATA_DIR = os.getenv("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
NLTK_DIR = os.path.join(DATA_DIR, "nltk")
TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")
@@ -83,15 +94,18 @@ MEDIA_LOCK = os.path.join(MEDIA_ROOT, "media.lock")
INDEX_DIR = os.path.join(DATA_DIR, "index")
MODEL_FILE = os.path.join(DATA_DIR, "classification_model.pickle")
LOGGING_DIR = os.getenv("PAPERLESS_LOGGING_DIR", os.path.join(DATA_DIR, "log"))
LOGGING_DIR = __get_path("PAPERLESS_LOGGING_DIR", os.path.join(DATA_DIR, "log"))
CONSUMPTION_DIR = os.getenv(
CONSUMPTION_DIR = __get_path(
"PAPERLESS_CONSUMPTION_DIR",
os.path.join(BASE_DIR, "..", "consume"),
)
# This will be created if it doesn't exist
SCRATCH_DIR = os.getenv("PAPERLESS_SCRATCH_DIR", "/tmp/paperless")
SCRATCH_DIR = __get_path(
"PAPERLESS_SCRATCH_DIR",
os.path.join(tempfile.gettempdir(), "paperless"),
)
###############################################################################
# Application Definition #
@@ -117,7 +131,7 @@ INSTALLED_APPS = [
"rest_framework",
"rest_framework.authtoken",
"django_filters",
"django_q",
"django_celery_results",
] + env_apps
if DEBUG:
@@ -168,6 +182,8 @@ ASGI_APPLICATION = "paperless.asgi.application"
STATIC_URL = os.getenv("PAPERLESS_STATIC_URL", BASE_URL + "static/")
WHITENOISE_STATIC_PREFIX = "/static/"
_REDIS_URL = os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")
# TODO: what is this used for?
TEMPLATES = [
{
@@ -189,7 +205,7 @@ CHANNEL_LAYERS = {
"default": {
"BACKEND": "channels_redis.core.RedisChannelLayer",
"CONFIG": {
"hosts": [os.getenv("PAPERLESS_REDIS", "redis://localhost:6379")],
"hosts": [_REDIS_URL],
"capacity": 2000, # default 100
"expiry": 15, # default 60
},
@@ -274,7 +290,7 @@ SECRET_KEY = os.getenv(
AUTH_PASSWORD_VALIDATORS = [
{
"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", # noqa: E501
},
{
"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
@@ -308,6 +324,7 @@ DATABASES = {
"default": {
"ENGINE": "django.db.backends.sqlite3",
"NAME": os.path.join(DATA_DIR, "db.sqlite3"),
"OPTIONS": {},
},
}
@@ -317,16 +334,31 @@ if os.getenv("PAPERLESS_DBHOST"):
DATABASES["sqlite"] = DATABASES["default"].copy()
DATABASES["default"] = {
"ENGINE": "django.db.backends.postgresql_psycopg2",
"HOST": os.getenv("PAPERLESS_DBHOST"),
"NAME": os.getenv("PAPERLESS_DBNAME", "paperless"),
"USER": os.getenv("PAPERLESS_DBUSER", "paperless"),
"PASSWORD": os.getenv("PAPERLESS_DBPASS", "paperless"),
"OPTIONS": {"sslmode": os.getenv("PAPERLESS_DBSSLMODE", "prefer")},
"OPTIONS": {},
}
if os.getenv("PAPERLESS_DBPORT"):
DATABASES["default"]["PORT"] = os.getenv("PAPERLESS_DBPORT")
# Leave room for future extensibility
if os.getenv("PAPERLESS_DBENGINE") == "mariadb":
engine = "django.db.backends.mysql"
options = {"read_default_file": "/etc/mysql/my.cnf", "charset": "utf8mb4"}
else: # Default to PostgresDB
engine = "django.db.backends.postgresql_psycopg2"
options = {"sslmode": os.getenv("PAPERLESS_DBSSLMODE", "prefer")}
DATABASES["default"]["ENGINE"] = engine
DATABASES["default"]["OPTIONS"].update(options)
if os.getenv("PAPERLESS_DB_TIMEOUT") is not None:
DATABASES["default"]["OPTIONS"].update(
{"timeout": float(os.getenv("PAPERLESS_DB_TIMEOUT"))},
)
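# A sketch of the net effect when PAPERLESS_DBENGINE=mariadb is combined
# with PAPERLESS_DBHOST=db (assumed values; remaining keys keep defaults):
_demo_expected_mariadb_default = {
    "ENGINE": "django.db.backends.mysql",
    "HOST": "db",
    "NAME": "paperless",
    "USER": "paperless",
    "PASSWORD": "paperless",
    "OPTIONS": {"read_default_file": "/etc/mysql/my.cnf", "charset": "utf8mb4"},
}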
DEFAULT_AUTO_FIELD = "django.db.models.AutoField"
###############################################################################
@@ -425,47 +457,57 @@ LOGGING = {
# Task queue #
###############################################################################
TASK_WORKERS = __get_int("PAPERLESS_TASK_WORKERS", 1)
# Sensible defaults for multitasking:
# use a fair balance between worker processes and threads per worker so that
# both consuming many documents in parallel and consuming large documents is
# reasonably fast.
# Favors threads per worker on smaller systems and never exceeds cpu_count()
# in total.
WORKER_TIMEOUT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)
CELERY_BROKER_URL = _REDIS_URL
CELERY_TIMEZONE = TIME_ZONE
def default_task_workers() -> int:
# guard against a zero or unreported core count; use at least one core
available_cores = max(multiprocessing.cpu_count(), 1)
try:
if available_cores < 4:
return available_cores
return max(math.floor(math.sqrt(available_cores)), 1)
except NotImplementedError:
return 1
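# Worked examples of the sizing rule above (a mirror for illustration,
# not a replacement for default_task_workers):
#
#   def _demo_workers(cores: int) -> int:
#       return cores if cores < 4 else max(math.floor(math.sqrt(cores)), 1)
#
#   _demo_workers(2)  == 2   # small systems: one worker per core
#   _demo_workers(4)  == 2   # 4+ cores: floor(sqrt(cores))
#   _demo_workers(16) == 4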
CELERY_WORKER_HIJACK_ROOT_LOGGER = False
CELERY_WORKER_CONCURRENCY = TASK_WORKERS
CELERY_WORKER_MAX_TASKS_PER_CHILD = 1
CELERY_WORKER_SEND_TASK_EVENTS = True
CELERY_SEND_TASK_SENT_EVENT = True
TASK_WORKERS = __get_int("PAPERLESS_TASK_WORKERS", default_task_workers())
CELERY_TASK_TRACK_STARTED = True
CELERY_TASK_TIME_LIMIT = WORKER_TIMEOUT
PAPERLESS_WORKER_TIMEOUT: Final[int] = __get_int("PAPERLESS_WORKER_TIMEOUT", 1800)
CELERY_RESULT_EXTENDED = True
CELERY_RESULT_BACKEND = "django-db"
CELERY_CACHE_BACKEND = "default"
# Per django-q docs, timeout must be smaller than retry
# We default retry to 10s more than the timeout
PAPERLESS_WORKER_RETRY: Final[int] = __get_int(
"PAPERLESS_WORKER_RETRY",
PAPERLESS_WORKER_TIMEOUT + 10,
)
CELERY_BEAT_SCHEDULE = {
# Every ten minutes
"Check all e-mail accounts": {
"task": "paperless_mail.tasks.process_mail_accounts",
"schedule": crontab(minute="*/10"),
},
# Hourly at 5 minutes past the hour
"Train the classifier": {
"task": "documents.tasks.train_classifier",
"schedule": crontab(minute="5", hour="*/1"),
},
# Daily at midnight
"Optimize the index": {
"task": "documents.tasks.index_optimize",
"schedule": crontab(minute=0, hour=0),
},
# Weekly, Sunday at 00:30
"Perform sanity check": {
"task": "documents.tasks.sanity_check",
"schedule": crontab(minute=30, hour=0, day_of_week="sun"),
},
}
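# How the celery.schedules.crontab entries above read, for reference
# (demo names only):
_demo_every_ten_minutes = crontab(minute="*/10")          # :00, :10, :20, ...
_demo_hourly_five_past = crontab(minute="5", hour="*/1")  # xx:05 every hour
_demo_sunday_half_past = crontab(minute=30, hour=0, day_of_week="sun")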
CELERY_BEAT_SCHEDULE_FILENAME = os.path.join(DATA_DIR, "celerybeat-schedule.db")
Q_CLUSTER = {
"name": "paperless",
"guard_cycle": 5,
"catch_up": False,
"recycle": 1,
"retry": PAPERLESS_WORKER_RETRY,
"timeout": PAPERLESS_WORKER_TIMEOUT,
"workers": TASK_WORKERS,
"redis": os.getenv("PAPERLESS_REDIS", "redis://localhost:6379"),
"log_level": "DEBUG" if DEBUG else "INFO",
# Django cache framework setting, backed by the same Redis instance.
CACHES = {
"default": {
"BACKEND": "django.core.cache.backends.redis.RedisCache",
"LOCATION": _REDIS_URL,
},
}
@@ -509,7 +551,7 @@ CONSUMER_IGNORE_PATTERNS = list(
json.loads(
os.getenv(
"PAPERLESS_CONSUMER_IGNORE_PATTERNS",
'[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]',
'[".DS_STORE/*", "._*", ".stfolder/*", ".stversions/*", ".localized/*", "desktop.ini"]', # noqa: E501
),
),
)
@@ -533,11 +575,9 @@ OCR_PAGES = int(os.getenv("PAPERLESS_OCR_PAGES", 0))
OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng")
# OCRmyPDF --output-type options are available.
# TODO: validate this setting.
OCR_OUTPUT_TYPE = os.getenv("PAPERLESS_OCR_OUTPUT_TYPE", "pdfa")
# skip, redo, force
# TODO: validate this.
OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip")
OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI")
@@ -590,6 +630,11 @@ POST_CONSUME_SCRIPT = os.getenv("PAPERLESS_POST_CONSUME_SCRIPT")
DATE_ORDER = os.getenv("PAPERLESS_DATE_ORDER", "DMY")
FILENAME_DATE_ORDER = os.getenv("PAPERLESS_FILENAME_DATE_ORDER")
# Maximum number of dates taken from document start to end to show as suggestions for
# `created` date in the frontend. Duplicates are removed, which can result in
# fewer dates shown.
NUMBER_OF_SUGGESTED_DATES = __get_int("PAPERLESS_NUMBER_OF_SUGGESTED_DATES", 3)
# Transformations applied before filename parsing
FILENAME_PARSE_TRANSFORMS = []
for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
@@ -598,7 +643,8 @@ for t in json.loads(os.getenv("PAPERLESS_FILENAME_PARSE_TRANSFORMS", "[]")):
# Specify the filename format for out files
FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")
# If this is enabled, variables in filename format will resolve to empty-string instead of 'none'.
# If this is enabled, variables in filename format will resolve to
# empty-string instead of 'none'.
# Directories with 'empty names' are omitted, too.
FILENAME_FORMAT_REMOVE_NONE = __get_boolean(
"PAPERLESS_FILENAME_FORMAT_REMOVE_NONE",
@@ -610,16 +656,15 @@ THUMBNAIL_FONT_NAME = os.getenv(
"/usr/share/fonts/liberation/LiberationSerif-Regular.ttf",
)
# TODO: this should not have a prefix.
# Tika settings
PAPERLESS_TIKA_ENABLED = __get_boolean("PAPERLESS_TIKA_ENABLED", "NO")
PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost:9998")
PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
TIKA_ENABLED = __get_boolean("PAPERLESS_TIKA_ENABLED", "NO")
TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT", "http://localhost:9998")
TIKA_GOTENBERG_ENDPOINT = os.getenv(
"PAPERLESS_TIKA_GOTENBERG_ENDPOINT",
"http://localhost:3000",
)
if PAPERLESS_TIKA_ENABLED:
if TIKA_ENABLED:
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
@@ -632,8 +677,9 @@ def _parse_ignore_dates(
user provided string(s) into dates
Args:
env_ignore (str): The value of the environment variable, comma seperated dates
date_order (str, optional): The format of the date strings. Defaults to DATE_ORDER.
env_ignore (str): The value of the environment variable, comma separated dates
date_order (str, optional): The format of the date strings.
Defaults to DATE_ORDER.
Returns:
Set[datetime.datetime]: The set of parsed date objects
@@ -662,3 +708,40 @@ if os.getenv("PAPERLESS_IGNORE_DATES") is not None:
ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
if ENABLE_UPDATE_CHECK != "default":
ENABLE_UPDATE_CHECK = __get_boolean("PAPERLESS_ENABLE_UPDATE_CHECK")
###############################################################################
# Machine Learning #
###############################################################################
def _get_nltk_language_setting(ocr_lang: str) -> Optional[str]:
"""
Maps an ISO-639-1 language code supported by Tesseract into
an optional NLTK language name. This is the set of common supported
languages for all the NLTK data used.
Assumption: The primary language is first
"""
ocr_lang = ocr_lang.split("+")[0]
iso_code_to_nltk = {
"dan": "danish",
"nld": "dutch",
"eng": "english",
"fin": "finnish",
"fra": "french",
"deu": "german",
"ita": "italian",
"nor": "norwegian",
"por": "portuguese",
"rus": "russian",
"spa": "spanish",
"swe": "swedish",
"tur": "turkish",
}
return iso_code_to_nltk.get(ocr_lang, None)
NLTK_ENABLED: Final[bool] = __get_boolean("PAPERLESS_ENABLE_NLTK", "yes")
NLTK_LANGUAGE: Optional[str] = _get_nltk_language_setting(OCR_LANGUAGE)
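# Example behavior of _get_nltk_language_setting (the primary language,
# i.e. the first "+"-separated code, decides; unknown codes disable NLTK):
#   _get_nltk_language_setting("deu+eng") == "german"
#   _get_nltk_language_setting("eng")     == "english"
#   _get_nltk_language_setting("jpn")     is None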

View File

@@ -1,12 +1,12 @@
import os
import shutil
from django.test import override_settings
from django.test import TestCase
from documents.tests.utils import DirectoriesMixin
from paperless import binaries_check
from paperless import paths_check
from paperless.checks import binaries_check
from paperless.checks import debug_mode_check
from paperless.checks import paths_check
from paperless.checks import settings_values_check
class TestChecks(DirectoriesMixin, TestCase):
@@ -54,3 +54,89 @@ class TestChecks(DirectoriesMixin, TestCase):
@override_settings(DEBUG=True)
def test_debug_enabled(self):
self.assertEqual(len(debug_mode_check(None)), 1)
class TestSettingsChecks(DirectoriesMixin, TestCase):
def test_all_valid(self):
"""
GIVEN:
- Default settings
WHEN:
- Settings are validated
THEN:
- No system check errors reported
"""
msgs = settings_values_check(None)
self.assertEqual(len(msgs), 0)
@override_settings(OCR_OUTPUT_TYPE="notapdf")
def test_invalid_output_type(self):
"""
GIVEN:
- Default settings
- OCR output type is invalid
WHEN:
- Settings are validated
THEN:
- system check error reported for OCR output type
"""
msgs = settings_values_check(None)
self.assertEqual(len(msgs), 1)
msg = msgs[0]
self.assertIn('OCR output type "notapdf"', msg.msg)
@override_settings(OCR_MODE="makeitso")
def test_invalid_ocr_type(self):
"""
GIVEN:
- Default settings
- OCR type is invalid
WHEN:
- Settings are validated
THEN:
- system check error reported for OCR type
"""
msgs = settings_values_check(None)
self.assertEqual(len(msgs), 1)
msg = msgs[0]
self.assertIn('OCR output mode "makeitso"', msg.msg)
@override_settings(OCR_CLEAN="cleanme")
def test_invalid_ocr_clean(self):
"""
GIVEN:
- Default settings
- OCR cleaning type is invalid
WHEN:
- Settings are validated
THEN:
- system check error reported for OCR cleaning type
"""
msgs = settings_values_check(None)
self.assertEqual(len(msgs), 1)
msg = msgs[0]
self.assertIn('OCR clean mode "cleanme"', msg.msg)
@override_settings(TIME_ZONE="TheMoon\\MyCrater")
def test_invalid_timezone(self):
"""
GIVEN:
- Default settings
- Timezone is invalid
WHEN:
- Settings are validated
THEN:
- system check error reported for timezone
"""
msgs = settings_values_check(None)
self.assertEqual(len(msgs), 1)
msg = msgs[0]
self.assertIn('Timezone "TheMoon\\MyCrater"', msg.msg)

View File

@@ -1,7 +1,9 @@
import datetime
from unittest import mock
from unittest import TestCase
from paperless.settings import _parse_ignore_dates
from paperless.settings import default_threads_per_worker
class TestIgnoreDateParsing(TestCase):
@@ -56,3 +58,27 @@ class TestIgnoreDateParsing(TestCase):
]
self._parse_checker(test_cases)
def test_workers_threads(self):
"""
GIVEN:
- Certain CPU counts
WHEN:
- Threads per worker is calculated
THEN:
- Threads per worker less than or equal to CPU count
- At least 1 thread per worker
"""
default_workers = 1
for i in range(1, 64):
with mock.patch(
"paperless.settings.multiprocessing.cpu_count",
) as cpu_count:
cpu_count.return_value = i
default_threads = default_threads_per_worker(default_workers)
self.assertGreaterEqual(default_threads, 1)
self.assertLessEqual(default_workers * default_threads, i)

View File

@@ -1,7 +1,7 @@
from typing import Final
from typing import Tuple
__version__: Final[Tuple[int, int, int]] = (1, 7, 1)
__version__: Final[Tuple[int, int, int]] = (1, 9, 2)
# Version string like X.Y.Z
__full_version_str__: Final[str] = ".".join(map(str, __version__))
# Version string like X.Y

View File

@@ -1,24 +1,26 @@
import os
import re
import tempfile
from datetime import date
from datetime import timedelta
from fnmatch import fnmatch
from imaplib import IMAP4
from typing import Dict
import magic
import pathvalidate
from django.conf import settings
from django.db import DatabaseError
from django_q.tasks import async_task
from documents.loggers import LoggingMixin
from documents.models import Correspondent
from documents.parsers import is_mime_type_supported
from documents.tasks import consume_file
from imap_tools import AND
from imap_tools import MailBox
from imap_tools import MailboxFolderSelectError
from imap_tools import MailBoxUnencrypted
from imap_tools import MailMessage
from imap_tools import MailMessageFlags
from imap_tools import NOT
from imap_tools.mailbox import MailBoxTls
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
@@ -29,7 +31,7 @@ class MailError(Exception):
class BaseMailAction:
def get_criteria(self):
def get_criteria(self) -> Dict:
return {}
def post_consume(self, M, message_uids, parameter):
@@ -67,13 +69,17 @@ class TagMailAction(BaseMailAction):
self.keyword = parameter
def get_criteria(self):
return {"no_keyword": self.keyword}
return {"no_keyword": self.keyword, "gmail_label": self.keyword}
def post_consume(self, M: MailBox, message_uids, parameter):
M.flag(message_uids, [self.keyword], True)
if re.search(r"gmail\.com$|googlemail\.com$", M._host):
for uid in message_uids:
M.client.uid("STORE", uid, "X-GM-LABELS", self.keyword)
else:
M.flag(message_uids, [self.keyword], True)
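# For reference: on Gmail hosts the uid() call above issues a raw
# UID STORE using the X-GM-LABELS extension, roughly
#   a1 UID STORE <uid> X-GM-LABELS (processed)
# while non-Gmail servers get a standard IMAP keyword via M.flag().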
def get_rule_action(rule):
def get_rule_action(rule) -> BaseMailAction:
if rule.action == MailRule.MailAction.FLAG:
return FlagMailAction()
elif rule.action == MailRule.MailAction.DELETE:
@@ -103,7 +109,7 @@ def make_criterias(rule):
return {**criterias, **get_rule_action(rule).get_criteria()}
def get_mailbox(server, port, security):
def get_mailbox(server, port, security) -> MailBox:
if security == MailAccount.ImapSecurity.NONE:
mailbox = MailBoxUnencrypted(server, port)
elif security == MailAccount.ImapSecurity.STARTTLS:
@@ -162,7 +168,7 @@ class MailAccountHandler(LoggingMixin):
"Unknown correspondent selector",
) # pragma: nocover
def handle_mail_account(self, account):
def handle_mail_account(self, account: MailAccount):
self.renew_logging_group()
@@ -176,33 +182,29 @@ class MailAccountHandler(LoggingMixin):
account.imap_security,
) as M:
supports_gmail_labels = "X-GM-EXT-1" in M.client.capabilities
supports_auth_plain = "AUTH=PLAIN" in M.client.capabilities
self.log("debug", f"GMAIL Label Support: {supports_gmail_labels}")
self.log("debug", f"AUTH=PLAIN Support: {supports_auth_plain}")
try:
M.login(account.username, account.password)
except UnicodeEncodeError:
self.log("debug", "Falling back to AUTH=PLAIN")
try:
# rfc2595 section 6 - PLAIN SASL mechanism
client: IMAP4 = M.client
encoded = (
b"\0"
+ account.username.encode("utf8")
+ b"\0"
+ account.password.encode("utf8")
)
# Assumption is the server supports AUTH=PLAIN capability
# Could check the list with client.capability(), but then what?
# We're failing anyway then
client.authenticate("PLAIN", lambda x: encoded)
# Need to transition out of AUTH state to SELECTED
M.folder.set("INBOX")
except Exception:
try:
M.login_utf8(account.username, account.password)
except Exception as err:
self.log(
"error",
"Unable to authenticate with mail server using AUTH=PLAIN",
)
raise MailError(f"Error while authenticating account {account}")
raise MailError(
f"Error while authenticating account {account}",
) from err
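# For reference, the removed fallback above built the RFC 2595 PLAIN SASL
# initial response by hand: b"\0" + username + b"\0" + password, both
# UTF-8 encoded; imap_tools' login_utf8() is assumed to wrap the same
# mechanism internally.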
except Exception as e:
self.log(
"error",
@@ -221,7 +223,11 @@ class MailAccountHandler(LoggingMixin):
for rule in account.rules.order_by("order"):
try:
total_processed_files += self.handle_mail_rule(M, rule)
total_processed_files += self.handle_mail_rule(
M,
rule,
supports_gmail_labels,
)
except Exception as e:
self.log(
"error",
@@ -239,13 +245,18 @@ class MailAccountHandler(LoggingMixin):
return total_processed_files
def handle_mail_rule(self, M: MailBox, rule: MailRule):
def handle_mail_rule(
self,
M: MailBox,
rule: MailRule,
supports_gmail_labels: bool = False,
):
self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}")
try:
M.folder.set(rule.folder)
except MailboxFolderSelectError:
except MailboxFolderSelectError as err:
self.log(
"error",
@@ -264,23 +275,38 @@ class MailAccountHandler(LoggingMixin):
raise MailError(
f"Rule {rule}: Folder {rule.folder} "
f"does not exist in account {rule.account}",
)
) from err
criterias = make_criterias(rule)
# Deal with the Gmail label extension
if "gmail_label" in criterias:
gmail_label = criterias["gmail_label"]
del criterias["gmail_label"]
if not supports_gmail_labels:
criterias_imap = AND(**criterias)
else:
criterias_imap = AND(NOT(gmail_label=gmail_label), **criterias)
else:
criterias_imap = AND(**criterias)
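# Roughly what the two branches yield as IMAP search strings for a
# TagMailAction with keyword "processed" (rendering per imap_tools):
#   AND(no_keyword="processed")        -> '(UNKEYWORD processed)'
#   AND(NOT(gmail_label="processed"))  -> '(NOT (X-GM-LABELS "processed"))'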
self.log(
"debug",
f"Rule {rule}: Searching folder with criteria " f"{str(AND(**criterias))}",
f"Rule {rule}: Searching folder with criteria " f"{str(criterias_imap)}",
)
try:
messages = M.fetch(
criteria=AND(**criterias),
criteria=criterias_imap,
mark_seen=False,
charset=rule.account.character_set,
)
except Exception:
raise MailError(f"Rule {rule}: Error while fetching folder {rule.folder}")
except Exception as err:
raise MailError(
f"Rule {rule}: Error while fetching folder {rule.folder}",
) from err
post_consume_messages = []
@@ -320,7 +346,7 @@ class MailAccountHandler(LoggingMixin):
except Exception as e:
raise MailError(
f"Rule {rule}: Error while processing post-consume actions: " f"{e}",
)
) from e
return total_processed_files
@@ -382,8 +408,7 @@ class MailAccountHandler(LoggingMixin):
f"{message.subject} from {message.from_}",
)
async_task(
"documents.tasks.consume_file",
consume_file.delay(
path=temp_filename,
override_filename=pathvalidate.sanitize_filename(
message.subject + ".eml",
@@ -447,8 +472,7 @@ class MailAccountHandler(LoggingMixin):
f"{message.subject} from {message.from_}",
)
async_task(
"documents.tasks.consume_file",
consume_file.delay(
path=temp_filename,
override_filename=pathvalidate.sanitize_filename(
att.filename,

View File

@@ -2,28 +2,12 @@
from django.db import migrations
from django.db.migrations import RunPython
from django_q.models import Schedule
from django_q.tasks import schedule
def add_schedules(apps, schema_editor):
schedule(
"paperless_mail.tasks.process_mail_accounts",
name="Check all e-mail accounts",
schedule_type=Schedule.MINUTES,
minutes=10,
)
def remove_schedules(apps, schema_editor):
Schedule.objects.filter(func="paperless_mail.tasks.process_mail_accounts").delete()
class Migration(migrations.Migration):
dependencies = [
("paperless_mail", "0001_initial"),
("django_q", "0013_task_attempt_count"),
]
operations = [RunPython(add_schedules, remove_schedules)]
operations = [RunPython(migrations.RunPython.noop, migrations.RunPython.noop)]

View File

@@ -1,13 +1,14 @@
import logging
from celery import shared_task
from paperless_mail.mail import MailAccountHandler
from paperless_mail.mail import MailError
from paperless_mail.models import MailAccount
logger = logging.getLogger("paperless.mail.tasks")
@shared_task
def process_mail_accounts():
total_new_documents = 0
for account in MailAccount.objects.all():
@@ -20,11 +21,3 @@ def process_mail_accounts():
return f"Added {total_new_documents} document(s)."
else:
return "No new documents were added."
def process_mail_account(name):
try:
account = MailAccount.objects.get(name=name)
MailAccountHandler().handle_mail_account(account)
except MailAccount.DoesNotExist:
logger.error(f"Unknown mail acccount: {name}")

View File

@@ -0,0 +1,70 @@
import os
import pytest
from django.test import TestCase
from paperless_mail.mail import MailAccountHandler
from paperless_mail.mail import MailError
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
# Only run if the environment is set up
# and the host variable is not empty (as can happen in forks)
@pytest.mark.skipif(
"PAPERLESS_MAIL_TEST_HOST" not in os.environ
or not len(os.environ["PAPERLESS_MAIL_TEST_HOST"]),
reason="Live server testing not enabled",
)
class TestMailLiveServer(TestCase):
def setUp(self) -> None:
self.mail_account_handler = MailAccountHandler()
self.account = MailAccount.objects.create(
name="test",
imap_server=os.environ["PAPERLESS_MAIL_TEST_HOST"],
username=os.environ["PAPERLESS_MAIL_TEST_USER"],
password=os.environ["PAPERLESS_MAIL_TEST_PASSWD"],
imap_port=993,
)
return super().setUp()
def tearDown(self) -> None:
self.account.delete()
return super().tearDown()
def test_process_non_gmail_server_flag(self):
try:
rule1 = MailRule.objects.create(
name="testrule",
account=self.account,
action=MailRule.MailAction.FLAG,
)
self.mail_account_handler.handle_mail_account(self.account)
rule1.delete()
except MailError as e:
self.fail(f"Failure: {e}")
except Exception as e:
pass
def test_process_non_gmail_server_tag(self):
try:
rule2 = MailRule.objects.create(
name="testrule",
account=self.account,
action=MailRule.MailAction.TAG,
)
self.mail_account_handler.handle_mail_account(self.account)
rule2.delete()
except MailError as e:
self.fail(f"Failure: {e}")
except Exception as e:
pass

View File

@@ -20,6 +20,7 @@ from imap_tools import MailboxFolderSelectError
from imap_tools import MailboxLoginError
from imap_tools import MailMessage
from imap_tools import MailMessageFlags
from imap_tools import NOT
from paperless_mail import tasks
from paperless_mail.mail import MailAccountHandler
from paperless_mail.mail import MailError
@@ -46,31 +47,66 @@ class BogusFolderManager:
class BogusClient:
def authenticate(self, mechanism, authobject):
# authobject must be a callable object
auth_bytes = authobject(None)
if auth_bytes != b"\x00admin\x00w57\xc3\xa4\xc3\xb6\xc3\xbcw4b6huwb6nhu":
raise MailboxLoginError("BAD", "OK")
def __init__(self, messages):
self.messages: List[MailMessage] = messages
self.capabilities: List[str] = []
class BogusMailBox(ContextManager):
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
def authenticate(self, mechanism, authobject):
# authobject must be a callable object
auth_bytes = authobject(None)
if auth_bytes != b"\x00admin\x00w57\xc3\xa4\xc3\xb6\xc3\xbcw4b6huwb6nhu":
raise MailboxLoginError("BAD", "OK")
def uid(self, command, *args):
if command == "STORE":
for message in self.messages:
if message.uid == args[0]:
flag = args[2]
if flag == "processed":
message._raw_flag_data.append("+FLAGS (processed)".encode())
MailMessage.flags.fget.cache_clear()
class BogusMailBox(ContextManager):
# Common values so tests don't need to remember an accepted login
USERNAME: str = "admin"
ASCII_PASSWORD: str = "secret"
# Note the non-ascii characters here
UTF_PASSWORD: str = "w57äöüw4b6huwb6nhu"
def __init__(self):
self.messages: List[MailMessage] = []
self.messages_spam: List[MailMessage] = []
self.folder = BogusFolderManager()
self.client = BogusClient()
self.client = BogusClient(self.messages)
self._host = ""
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
def updateClient(self):
self.client = BogusClient(self.messages)
def login(self, username, password):
# This will raise a UnicodeEncodeError if the password is not ASCII only
password.encode("ascii")
# Otherwise, check for correct values
if username != "admin" or password not in {"secret"}:
if username != self.USERNAME or password != self.ASCII_PASSWORD:
raise MailboxLoginError("BAD", "OK")
def login_utf8(self, username, password):
# Expected to only be called with the UTF-8 password
if username != self.USERNAME or password != self.UTF_PASSWORD:
raise MailboxLoginError("BAD", "OK")
def fetch(self, criteria, mark_seen, charset=""):
@@ -100,6 +136,9 @@ class BogusMailBox(ContextManager):
tag = criteria[criteria.index("UNKEYWORD") + 1].strip("'")
msg = filter(lambda m: "processed" not in m.flags, msg)
if "(X-GM-LABELS" in criteria: # ['NOT', '(X-GM-LABELS', '"processed"']
msg = filter(lambda m: "processed" not in m.flags, msg)
return list(msg)
def delete(self, uid_list):
@@ -209,7 +248,7 @@ class TestMail(DirectoriesMixin, TestCase):
m.return_value = self.bogus_mailbox
self.addCleanup(patcher.stop)
patcher = mock.patch("paperless_mail.mail.async_task")
patcher = mock.patch("paperless_mail.mail.consume_file.delay")
self.async_task = patcher.start()
self.addCleanup(patcher.stop)
@@ -247,6 +286,7 @@ class TestMail(DirectoriesMixin, TestCase):
seen=False,
),
)
self.bogus_mailbox.updateClient()
def test_get_correspondent(self):
message = namedtuple("MailMessage", [])
@@ -607,6 +647,33 @@ class TestMail(DirectoriesMixin, TestCase):
self.assertEqual(len(self.bogus_mailbox.fetch("UNKEYWORD processed", False)), 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
def test_handle_mail_account_tag_gmail(self):
self.bogus_mailbox._host = "imap.gmail.com"
self.bogus_mailbox.client.capabilities = ["X-GM-EXT-1"]
account = MailAccount.objects.create(
name="test",
imap_server="",
username="admin",
password="secret",
)
_ = MailRule.objects.create(
name="testrule",
account=account,
action=MailRule.MailAction.TAG,
action_parameter="processed",
)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
self.assertEqual(self.async_task.call_count, 0)
criteria = NOT(gmail_label="processed")
self.assertEqual(len(self.bogus_mailbox.fetch(criteria, False)), 2)
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 2)
self.assertEqual(len(self.bogus_mailbox.fetch(criteria, False)), 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
def test_error_login(self):
account = MailAccount.objects.create(
name="test",
@@ -878,9 +945,9 @@ class TestMail(DirectoriesMixin, TestCase):
account = MailAccount.objects.create(
name="test",
imap_server="",
username="admin",
username=BogusMailBox.USERNAME,
# Note the non-ascii characters here
password="w57äöüw4b6huwb6nhu",
password=BogusMailBox.UTF_PASSWORD,
)
_ = MailRule.objects.create(
@@ -910,7 +977,7 @@ class TestMail(DirectoriesMixin, TestCase):
account = MailAccount.objects.create(
name="test",
imap_server="",
username="admin",
username=BogusMailBox.USERNAME,
# Note the non-ascii characters here
# Passes the check in login, not in authenticate
password="réception",
@@ -965,20 +1032,3 @@ class TestTasks(TestCase):
m.side_effect = lambda account: 0
result = tasks.process_mail_accounts()
self.assertIn("No new", result)
@mock.patch("paperless_mail.tasks.MailAccountHandler.handle_mail_account")
def test_single_accounts(self, m):
MailAccount.objects.create(
name="A",
imap_server="A",
username="A",
password="A",
)
tasks.process_mail_account("A")
m.assert_called_once()
m.reset_mock()
tasks.process_mail_account("B")
m.assert_not_called()

View File

@@ -249,16 +249,22 @@ class RasterisedDocumentParser(DocumentParser):
if mime_type == "application/pdf":
text_original = self.extract_text(None, document_path)
original_has_text = text_original and len(text_original) > 50
original_has_text = text_original is not None and len(text_original) > 50
else:
text_original = None
original_has_text = False
# If the original has text, and the user doesn't want an archive,
# we're done here
if settings.OCR_MODE == "skip_noarchive" and original_has_text:
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
# Either no text was in the original or there should be an archive
# file created, so OCR the file and create an archive with any
# text located via OCR
import ocrmypdf
from ocrmypdf import InputFileError, EncryptedPdfError
@@ -277,6 +283,7 @@ class RasterisedDocumentParser(DocumentParser):
ocrmypdf.ocr(**args)
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)
if not self.text:
@@ -323,11 +330,11 @@ class RasterisedDocumentParser(DocumentParser):
except Exception as e:
# If this fails, we have a serious issue at hand.
raise ParseError(f"{e.__class__.__name__}: {str(e)}")
raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
except Exception as e:
# Anything else is probably serious.
raise ParseError(f"{e.__class__.__name__}: {str(e)}")
raise ParseError(f"{e.__class__.__name__}: {str(e)}") from e
# As a last resort, if we still don't have any text for any reason,
# try to extract the text from the original document.

View File

@@ -341,6 +341,17 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
def test_multi_page_analog_pages_redo(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR of only pages 1 and 2 requested
- OCR mode set to redo
WHEN:
- Document is parsed
THEN:
- Text of page 1 and 2 extracted
- An archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
@@ -352,6 +363,17 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_PAGES=1, OCR_MODE="force")
def test_multi_page_analog_pages_force(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR of only page 1 requested
- OCR mode set to force
WHEN:
- Document is parsed
THEN:
- Only text of page 1 is extracted
- An archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
@@ -364,6 +386,16 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_withtext(self):
"""
GIVEN:
- File with existing text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
@@ -377,24 +409,47 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_MODE="skip_noarchive")
def test_skip_noarchive_notext(self):
"""
GIVEN:
- File with text contained in images but no text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- An archive file is created with the OCRd text
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
"application/pdf",
)
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(
parser.get_text().lower(),
["page 1", "page 2", "page 3"],
)
self.assertIsNotNone(parser.archive_path)
@override_settings(OCR_MODE="skip")
def test_multi_page_mixed(self):
"""
GIVEN:
- File with some text contained in images and some in text layer
- OCR mode set to skip
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- An archive file is created with the OCRd text and the original text
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
"application/pdf",
)
self.assertIsNotNone(parser.archive_path)
self.assertTrue(os.path.isfile(parser.archive_path))
self.assertContainsStrings(
parser.get_text().lower(),
@@ -408,6 +463,16 @@ class TestParser(DirectoriesMixin, TestCase):
@override_settings(OCR_MODE="skip_noarchive")
def test_multi_page_mixed_no_archive(self):
"""
GIVEN:
- File with some text contained in images and some in text layer
- OCR mode set to skip_noarchive
WHEN:
- Document is parsed
THEN:
- Text from images is extracted
- No archive file is created as original file contains text
"""
parser = RasterisedDocumentParser(None)
parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),

View File

@@ -11,5 +11,6 @@ def text_consumer_declaration(sender, **kwargs):
"mime_types": {
"text/plain": ".txt",
"text/csv": ".csv",
"application/csv": ".csv",
},
}

View File

@@ -9,6 +9,6 @@ class PaperlessTikaConfig(AppConfig):
def ready(self):
from documents.signals import document_consumer_declaration
if settings.PAPERLESS_TIKA_ENABLED:
if settings.TIKA_ENABLED:
document_consumer_declaration.connect(tika_consumer_declaration)
AppConfig.ready(self)

View File

@@ -1,4 +1,5 @@
import os
from pathlib import Path
import dateutil.parser
import requests
@@ -27,7 +28,12 @@ class TikaDocumentParser(DocumentParser):
)
def extract_metadata(self, document_path, mime_type):
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
tika_server = settings.TIKA_ENDPOINT
# tika does not support a PathLike, only strings
# ensure this is a string
document_path = str(document_path)
try:
parsed = parser.from_file(document_path, tika_server)
except Exception as e:
@@ -47,9 +53,13 @@ class TikaDocumentParser(DocumentParser):
for key in parsed["metadata"]
]
def parse(self, document_path, mime_type, file_name=None):
def parse(self, document_path: Path, mime_type, file_name=None):
self.log("info", f"Sending {document_path} to Tika server")
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
tika_server = settings.TIKA_ENDPOINT
# tika does not support a PathLike, only strings
# ensure this is a string
document_path = str(document_path)
try:
parsed = parser.from_file(document_path, tika_server)
@@ -57,7 +67,7 @@ class TikaDocumentParser(DocumentParser):
raise ParseError(
f"Could not parse {document_path} with tika server at "
f"{tika_server}: {err}",
)
) from err
self.text = parsed["content"].strip()
@@ -73,7 +83,7 @@ class TikaDocumentParser(DocumentParser):
def convert_to_pdf(self, document_path, file_name):
pdf_path = os.path.join(self.tempdir, "convert.pdf")
gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
url = gotenberg_server + "/forms/libreoffice/convert"
self.log("info", f"Converting {document_path} to PDF as {pdf_path}")
@@ -90,7 +100,9 @@ class TikaDocumentParser(DocumentParser):
response = requests.post(url, files=files, headers=headers)
response.raise_for_status() # ensure we notice bad responses
except Exception as err:
raise ParseError(f"Error while converting document to PDF: {err}")
raise ParseError(
f"Error while converting document to PDF: {err}",
) from err
with open(pdf_path, "wb") as file:
file.write(response.content)

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,78 @@
import datetime
import os
from pathlib import Path
from typing import Final
import pytest
from django.test import TestCase
from paperless_tika.parsers import TikaDocumentParser
@pytest.mark.skipif("TIKA_LIVE" not in os.environ, reason="No tika server")
class TestTikaParserAgainstServer(TestCase):
"""
This test case tests the Tika parsing against a live tika server,
if the environment contains the correct value indicating such a server
is available.
"""
SAMPLE_DIR: Final[Path] = (Path(__file__).parent / Path("samples")).resolve()
def setUp(self) -> None:
self.parser = TikaDocumentParser(logging_group=None)
def tearDown(self) -> None:
self.parser.cleanup()
def test_basic_parse_odt(self):
"""
GIVEN:
- An input ODT format document
WHEN:
- The document is parsed
THEN:
- Document content is correct
- Document date is correct
"""
test_file = self.SAMPLE_DIR / Path("sample.odt")
self.parser.parse(test_file, "application/vnd.oasis.opendocument.text")
self.assertEqual(
self.parser.text,
"This is an ODT test document, created September 14, 2022",
)
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
# PDFs begin with the bytes %PDF-x.y
self.assertTrue(b"PDF-" in f.read()[:10])
# TODO: Unsure what can set the Creation-Date field in a document, enable when possible
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))
def test_basic_parse_docx(self):
"""
GIVEN:
- An input DOCX format document
WHEN:
- The document is parsed
THEN:
- Document content is correct
- Document date is correct
"""
test_file = self.SAMPLE_DIR / Path("sample.docx")
self.parser.parse(
test_file,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
)
self.assertEqual(
self.parser.text,
"This is an DOCX test document, also made September 14, 2022",
)
self.assertIsNotNone(self.parser.archive_path)
with open(self.parser.archive_path, "rb") as f:
self.assertTrue(b"PDF-" in f.read()[:10])
# self.assertEqual(self.parser.date, datetime.datetime(2022, 9, 14))

View File

@@ -1,5 +1,5 @@
[flake8]
extend-exclude = */migrations/*, paperless/settings.py, */tests/*
extend-exclude = */migrations/*, */tests/*
# E203 - https://www.flake8rules.com/rules/E203.html
# W503 - https://www.flake8rules.com/rules/W503.html
ignore = E203,W503