Merge branch 'dev' into feature-permissions
@@ -19,7 +19,7 @@ from watchdog.observers.polling import PollingObserver

try:
    from inotifyrecursive import INotify, flags
-except ImportError:
+except ImportError:  # pragma: nocover
    INotify = flags = None

logger = logging.getLogger("paperless.management.consumer")
@@ -4,6 +4,9 @@ import os
 import shutil
 import tempfile
 import time
+from pathlib import Path
+from typing import List
+from typing import Set

 import tqdm
 from django.conf import settings
@@ -96,16 +99,16 @@ class Command(BaseCommand):

     def __init__(self, *args, **kwargs):
         BaseCommand.__init__(self, *args, **kwargs)
-        self.target = None
-        self.files_in_export_dir = []
-        self.exported_files = []
+        self.target: Path = None
+        self.files_in_export_dir: Set[Path] = set()
+        self.exported_files: List[Path] = []
         self.compare_checksums = False
         self.use_filename_format = False
         self.delete = False

     def handle(self, *args, **options):

-        self.target = options["target"]
+        self.target = Path(options["target"]).resolve()
         self.compare_checksums = options["compare_checksums"]
         self.use_filename_format = options["use_filename_format"]
         self.delete = options["delete"]
@@ -121,11 +124,14 @@ class Command(BaseCommand):
                 dir=settings.SCRATCH_DIR,
                 prefix="paperless-export",
             )
-            self.target = temp_dir.name
+            self.target = Path(temp_dir.name).resolve()

-        if not os.path.exists(self.target):
+        if not self.target.exists():
             raise CommandError("That path doesn't exist")

+        if not self.target.is_dir():
+            raise CommandError("That path isn't a directory")
+
         if not os.access(self.target, os.W_OK):
             raise CommandError("That path doesn't appear to be writable")
@@ -152,10 +158,9 @@ class Command(BaseCommand):

     def dump(self, progress_bar_disable=False):
         # 1. Take a snapshot of what files exist in the current export folder
-        for root, dirs, files in os.walk(self.target):
-            self.files_in_export_dir.extend(
-                map(lambda f: os.path.abspath(os.path.join(root, f)), files),
-            )
+        for x in self.target.glob("**/*"):
+            if x.is_file():
+                self.files_in_export_dir.add(x.resolve())

         # 2. Create manifest, containing all correspondents, types, tags, storage paths
         # comments, documents and ui_settings
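A note on the snapshot rewrite above: Path.glob("**/*") yields directories as well as files, hence the is_file() filter, and resolved paths now go into a set so the later membership tests in check_and_copy are cheap. A minimal standalone sketch of the same idiom (the directory name is a made-up example):

from pathlib import Path

export_dir = Path("/tmp/export-example")  # hypothetical export folder

# glob("**/*") walks recursively; keep only files, fully resolved
snapshot = {p.resolve() for p in export_dir.glob("**/*") if p.is_file()}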
@@ -238,16 +243,16 @@ class Command(BaseCommand):

             # 3.3. write filenames into manifest
             original_name = base_name
-            original_target = os.path.join(self.target, original_name)
+            original_target = (self.target / Path(original_name)).resolve()
             document_dict[EXPORTER_FILE_NAME] = original_name

             thumbnail_name = base_name + "-thumbnail.webp"
-            thumbnail_target = os.path.join(self.target, thumbnail_name)
+            thumbnail_target = (self.target / Path(thumbnail_name)).resolve()
             document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name

             if document.has_archive_version:
                 archive_name = base_name + "-archive.pdf"
-                archive_target = os.path.join(self.target, archive_name)
+                archive_target = (self.target / Path(archive_name)).resolve()
                 document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
             else:
                 archive_target = None
@@ -256,24 +261,21 @@ class Command(BaseCommand):
             t = int(time.mktime(document.created.timetuple()))
             if document.storage_type == Document.STORAGE_TYPE_GPG:

-                os.makedirs(os.path.dirname(original_target), exist_ok=True)
-                with open(original_target, "wb") as f:
-                    with document.source_file as out_file:
-                        f.write(GnuPG.decrypted(out_file))
-                    os.utime(original_target, times=(t, t))
+                original_target.parent.mkdir(parents=True, exist_ok=True)
+                with document.source_file as out_file:
+                    original_target.write_bytes(GnuPG.decrypted(out_file))
+                    os.utime(original_target, times=(t, t))

-                os.makedirs(os.path.dirname(thumbnail_target), exist_ok=True)
-                with open(thumbnail_target, "wb") as f:
-                    with document.thumbnail_file as out_file:
-                        f.write(GnuPG.decrypted(out_file))
-                    os.utime(thumbnail_target, times=(t, t))
+                thumbnail_target.parent.mkdir(parents=True, exist_ok=True)
+                with document.thumbnail_file as out_file:
+                    thumbnail_target.write_bytes(GnuPG.decrypted(out_file))
+                    os.utime(thumbnail_target, times=(t, t))

                 if archive_target:
-                    os.makedirs(os.path.dirname(archive_target), exist_ok=True)
-                    with open(archive_target, "wb") as f:
-                        with document.archive_path as out_file:
-                            f.write(GnuPG.decrypted(out_file))
-                        os.utime(archive_target, times=(t, t))
+                    archive_target.parent.mkdir(parents=True, exist_ok=True)
+                    with document.archive_path as out_file:
+                        archive_target.write_bytes(GnuPG.decrypted(out_file))
+                        os.utime(archive_target, times=(t, t))
             else:
                 self.check_and_copy(
                     document.source_path,
@@ -291,16 +293,14 @@ class Command(BaseCommand):
                 )

         # 4.1 write manifest to target folder
-        manifest_path = os.path.abspath(os.path.join(self.target, "manifest.json"))
-
-        with open(manifest_path, "w") as f:
-            json.dump(manifest, f, indent=2)
+        manifest_path = (self.target / Path("manifest.json")).resolve()
+        manifest_path.write_text(json.dumps(manifest, indent=2))

         # 4.2 write version information to target folder
-        version_path = os.path.abspath(os.path.join(self.target, "version.json"))
-
-        with open(version_path, "w") as f:
-            json.dump({"version": version.__full_version_str__}, f, indent=2)
+        version_path = (self.target / Path("version.json")).resolve()
+        version_path.write_text(
+            json.dumps({"version": version.__full_version_str__}, indent=2),
+        )

         if self.delete:
             # 5. Remove files which we did not explicitly export in this run
@@ -309,25 +309,24 @@ class Command(BaseCommand):
             self.files_in_export_dir.remove(manifest_path)

             for f in self.files_in_export_dir:
-                os.remove(f)
+                f.unlink()

                 delete_empty_directories(
-                    os.path.abspath(os.path.dirname(f)),
-                    os.path.abspath(self.target),
+                    f.parent,
+                    self.target,
                 )

-    def check_and_copy(self, source, source_checksum, target):
-        if os.path.abspath(target) in self.files_in_export_dir:
-            self.files_in_export_dir.remove(os.path.abspath(target))
+    def check_and_copy(self, source, source_checksum, target: Path):
+        if target in self.files_in_export_dir:
+            self.files_in_export_dir.remove(target)

         perform_copy = False

-        if os.path.exists(target):
+        if target.exists():
             source_stat = os.stat(source)
-            target_stat = os.stat(target)
+            target_stat = target.stat()
             if self.compare_checksums and source_checksum:
-                with open(target, "rb") as f:
-                    target_checksum = hashlib.md5(f.read()).hexdigest()
+                target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
                 perform_copy = target_checksum != source_checksum
             elif source_stat.st_mtime != target_stat.st_mtime:
                 perform_copy = True
@@ -338,5 +337,5 @@ class Command(BaseCommand):
             perform_copy = True

         if perform_copy:
-            os.makedirs(os.path.dirname(target), exist_ok=True)
+            target.parent.mkdir(parents=True, exist_ok=True)
             shutil.copy2(source, target)
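Taken together, the copy decision above reduces to three cases: the target does not exist yet, checksums differ (when checksum comparison is enabled), or mtimes differ. A condensed sketch of that logic (the helper name is hypothetical, not part of the commit):

import hashlib
from pathlib import Path
from typing import Optional


def needs_copy(
    source: Path,
    target: Path,
    source_checksum: Optional[str],
    compare_checksums: bool,
) -> bool:
    # Hypothetical helper mirroring check_and_copy's decision logic
    if not target.exists():
        return True
    if compare_checksums and source_checksum:
        # Content comparison via MD5, as in the exporter
        return hashlib.md5(target.read_bytes()).hexdigest() != source_checksum
    # Cheap default: compare modification times
    return source.stat().st_mtime != target.stat().st_mtime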
@@ -447,7 +447,7 @@ def update_filename_and_move_files(sender, instance, **kwargs):
             )

     except (OSError, DatabaseError, CannotMoveFilesException) as e:
-        logger.warn(f"Exception during file handling: {e}")
+        logger.warning(f"Exception during file handling: {e}")
         # This happens when either:
         # - moving the files failed due to file system errors
         # - saving to the database failed due to database errors
@@ -3,10 +3,10 @@ import logging
 import os
 import shutil
 import uuid
-from datetime import datetime
 from pathlib import Path
 from typing import Type

+import dateutil.parser
 import tqdm
 from asgiref.sync import async_to_sync
 from celery import shared_task
@@ -107,7 +107,7 @@ def consume_file(
     # More types will be retained through JSON encode/decode
     if override_created is not None and isinstance(override_created, str):
         try:
-            override_created = datetime.fromisoformat(override_created)
+            override_created = dateutil.parser.isoparse(override_created)
         except Exception:
             pass
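Context for the parser swap: datetime.fromisoformat() only accepts a narrow ISO-8601 subset (notably, it rejects the trailing "Z" UTC designator until Python 3.11), while dateutil's isoparse handles the full format, so timestamps that round-tripped through JSON keep parsing. A quick illustration:

import dateutil.parser

# datetime.fromisoformat("2022-11-01T12:00:00Z") raises ValueError on Python < 3.11;
# isoparse parses the "Z" suffix into a timezone-aware datetime
dt = dateutil.parser.isoparse("2022-11-01T12:00:00Z")
print(dt.tzinfo)  # tzutc()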
@@ -7,6 +7,7 @@ import tempfile
 import urllib.request
 import uuid
 import zipfile
+from pathlib import Path
 from unittest import mock
 from unittest.mock import MagicMock
@@ -812,7 +813,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         m.assert_called_once()

         args, kwargs = m.call_args
-        self.assertEqual(kwargs["override_filename"], "simple.pdf")
+        file_path = Path(args[0])
+        self.assertEqual(file_path.name, "simple.pdf")
+        self.assertIn(Path(settings.SCRATCH_DIR), file_path.parents)
         self.assertIsNone(kwargs["override_title"])
         self.assertIsNone(kwargs["override_correspondent_id"])
         self.assertIsNone(kwargs["override_document_type_id"])
@@ -837,7 +840,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
         m.assert_called_once()

         args, kwargs = m.call_args
-        self.assertEqual(kwargs["override_filename"], "simple.pdf")
+        file_path = Path(args[0])
+        self.assertEqual(file_path.name, "simple.pdf")
+        self.assertIn(Path(settings.SCRATCH_DIR), file_path.parents)
         self.assertIsNone(kwargs["override_title"])
         self.assertIsNone(kwargs["override_correspondent_id"])
         self.assertIsNone(kwargs["override_document_type_id"])
@@ -8,6 +8,7 @@ from unittest import mock
 from zipfile import ZipFile

 from django.core.management import call_command
+from django.core.management.base import CommandError
 from django.test import override_settings
 from django.test import TestCase
 from django.utils import timezone
@@ -438,3 +439,61 @@ class TestExportImport(DirectoriesMixin, TestCase):
             self.assertEqual(len(zip.namelist()), 14)
             self.assertIn("manifest.json", zip.namelist())
             self.assertIn("version.json", zip.namelist())
+
+    def test_export_target_not_exists(self):
+        """
+        GIVEN:
+            - Request to export documents to directory that doesn't exist
+        WHEN:
+            - Export command is called
+        THEN:
+            - Error is raised
+        """
+        args = ["document_exporter", "/tmp/foo/bar"]
+
+        with self.assertRaises(CommandError) as e:
+
+            call_command(*args)
+
+            self.assertEqual("That path isn't a directory", str(e))
+
+    def test_export_target_exists_but_is_file(self):
+        """
+        GIVEN:
+            - Request to export documents to file instead of directory
+        WHEN:
+            - Export command is called
+        THEN:
+            - Error is raised
+        """
+
+        with tempfile.NamedTemporaryFile() as tmp_file:
+
+            args = ["document_exporter", tmp_file.name]
+
+            with self.assertRaises(CommandError) as e:
+
+                call_command(*args)
+
+                self.assertEqual("That path isn't a directory", str(e))
+
+    def test_export_target_not_writable(self):
+        """
+        GIVEN:
+            - Request to export documents to directory that's not writeable
+        WHEN:
+            - Export command is called
+        THEN:
+            - Error is raised
+        """
+        with tempfile.TemporaryDirectory() as tmp_dir:
+
+            os.chmod(tmp_dir, 0o000)
+
+            args = ["document_exporter", tmp_dir]
+
+            with self.assertRaises(CommandError) as e:
+
+                call_command(*args)
+
+                self.assertEqual("That path doesn't appear to be writable", str(e))
@@ -8,10 +8,12 @@ import urllib
 import uuid
 import zipfile
 from datetime import datetime
+from pathlib import Path
 from time import mktime
 from unicodedata import normalize
 from urllib.parse import quote

+import pathvalidate
 from django.conf import settings
 from django.contrib.auth.models import User
 from django.db.models import Case
@@ -33,6 +35,7 @@ from documents.filters import ObjectOwnedOrGrandtedPermissionsFilter
 from documents.permissions import PaperlessAdminPermissions
 from documents.permissions import PaperlessObjectPermissions
 from documents.tasks import consume_file
+from langdetect import detect
 from packaging import version as packaging_version
 from paperless import version
 from paperless.db import GnuPG
@@ -361,6 +364,13 @@ class DocumentViewSet(
                 "original_filename": doc.original_filename,
             }

+            lang = "en"
+            try:
+                lang = detect(doc.content)
+            except Exception:
+                pass
+            meta["lang"] = lang
+
             if doc.has_archive_version:
                 meta["archive_size"] = self.get_filesize(doc.archive_path)
                 meta["archive_metadata"] = self.get_metadata(
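langdetect raises on empty or undecidable content, which is why the metadata code above wraps detect() in a broad try/except and falls back to "en". The same pattern as a standalone sketch (the helper name is illustrative only):

from langdetect import detect


def detect_language(content: str, default: str = "en") -> str:
    # detect() raises LangDetectException on empty/undecidable input,
    # so fall back to a default language code
    try:
        return detect(content)
    except Exception:
        return default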
@@ -658,20 +668,19 @@ class PostDocumentView(GenericAPIView):

         os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

-        with tempfile.NamedTemporaryFile(
-            prefix="paperless-upload-",
-            dir=settings.SCRATCH_DIR,
-            delete=False,
-        ) as f:
-            f.write(doc_data)
-            os.utime(f.name, times=(t, t))
-            temp_filename = f.name
+        temp_file_path = Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR)) / Path(
+            pathvalidate.sanitize_filename(doc_name),
+        )
+
+        temp_file_path.write_bytes(doc_data)
+
+        os.utime(temp_file_path, times=(t, t))

         task_id = str(uuid.uuid4())

         async_task = consume_file.delay(
-            temp_filename,
+            # Paths are not JSON friendly
+            str(temp_file_path),
             override_filename=doc_name,
             override_title=title,
             override_correspondent_id=correspondent_id,
             override_document_type_id=document_type_id,
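The upload now lands in a fresh per-upload directory under SCRATCH_DIR with its original (sanitized) name, instead of a random NamedTemporaryFile name; mkdtemp guarantees uniqueness, so sanitized names from different uploads cannot collide. A minimal reproduction of the idea (paths and the filename are examples):

import os
import tempfile
from pathlib import Path

import pathvalidate

scratch_dir = "/tmp/paperless-scratch"  # stand-in for settings.SCRATCH_DIR
os.makedirs(scratch_dir, exist_ok=True)

doc_name = "in?voice:2022.pdf"  # example user-supplied filename

# One unique directory per upload; the file keeps a safe version of its name
temp_file_path = Path(tempfile.mkdtemp(dir=scratch_dir)) / Path(
    pathvalidate.sanitize_filename(doc_name),
)
temp_file_path.write_bytes(b"%PDF-1.4 ...")  # placeholder content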
@@ -5,6 +5,7 @@ import multiprocessing
 import os
 import re
 import tempfile
+from typing import Dict
 from typing import Final
 from typing import Optional
 from typing import Set
@@ -107,6 +108,57 @@ def _parse_redis_url(env_redis: Optional[str]) -> Tuple[str]:
     return (env_redis, env_redis)


+def _parse_beat_schedule() -> Dict:
+    schedule = {}
+    tasks = [
+        {
+            "name": "Check all e-mail accounts",
+            "env_key": "PAPERLESS_EMAIL_TASK_CRON",
+            # Default every ten minutes
+            "env_default": "*/10 * * * *",
+            "task": "paperless_mail.tasks.process_mail_accounts",
+        },
+        {
+            "name": "Train the classifier",
+            "env_key": "PAPERLESS_TRAIN_TASK_CRON",
+            # Default hourly at 5 minutes past the hour
+            "env_default": "5 */1 * * *",
+            "task": "documents.tasks.train_classifier",
+        },
+        {
+            "name": "Optimize the index",
+            "env_key": "PAPERLESS_INDEX_TASK_CRON",
+            # Default daily at midnight
+            "env_default": "0 0 * * *",
+            "task": "documents.tasks.index_optimize",
+        },
+        {
+            "name": "Perform sanity check",
+            "env_key": "PAPERLESS_SANITY_TASK_CRON",
+            # Default Sunday at 00:30
+            "env_default": "30 0 * * sun",
+            "task": "documents.tasks.sanity_check",
+        },
+    ]
+    for task in tasks:
+        # Either get the environment setting or use the default
+        value = os.getenv(task["env_key"], task["env_default"])
+        # Don't add disabled tasks to the schedule
+        if value == "disable":
+            continue
+        # I find https://crontab.guru/ super helpful
+        # crontab(5) format
+        #   - five time-and-date fields
+        #   - separated by at least one blank
+        minute, hour, day_month, month, day_week = value.split(" ")
+        schedule[task["name"]] = {
+            "task": task["task"],
+            "schedule": crontab(minute, hour, day_week, day_month, month),
+        }
+
+    return schedule
+
+
 # NEVER RUN WITH DEBUG IN PRODUCTION.
 DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
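Note the argument order in the crontab() call above: celery's constructor is crontab(minute, hour, day_of_week, day_of_month, month_of_year), so the day-of-week and day-of-month fields swap relative to crontab(5) field order. A worked example with the classifier default:

from celery.schedules import crontab

value = "5 */1 * * *"  # PAPERLESS_TRAIN_TASK_CRON default
minute, hour, day_month, month, day_week = value.split(" ")

# Positional order: minute, hour, day_of_week, day_of_month, month_of_year
schedule = crontab(minute, hour, day_week, day_month, month)
# equivalent to crontab(minute="5", hour="*/1")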
@@ -126,7 +178,7 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")

 DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))

-NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/local/share/nltk_data")
+NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/share/nltk_data")

 TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")
@@ -533,29 +585,10 @@ CELERY_RESULT_EXTENDED = True
 CELERY_RESULT_BACKEND = "django-db"
 CELERY_CACHE_BACKEND = "default"

-CELERY_BEAT_SCHEDULE = {
-    # Every ten minutes
-    "Check all e-mail accounts": {
-        "task": "paperless_mail.tasks.process_mail_accounts",
-        "schedule": crontab(minute="*/10"),
-    },
-    # Hourly at 5 minutes past the hour
-    "Train the classifier": {
-        "task": "documents.tasks.train_classifier",
-        "schedule": crontab(minute="5", hour="*/1"),
-    },
-    # Daily at midnight
-    "Optimize the index": {
-        "task": "documents.tasks.index_optimize",
-        "schedule": crontab(minute=0, hour=0),
-    },
-    # Weekly, Sunday at 00:30
-    "Perform sanity check": {
-        "task": "documents.tasks.sanity_check",
-        "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
-    },
-}
+# https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-schedule
+CELERY_BEAT_SCHEDULE = _parse_beat_schedule()

+# https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-schedule-filename
 CELERY_BEAT_SCHEDULE_FILENAME = os.path.join(DATA_DIR, "celerybeat-schedule.db")

 # django setting.
@@ -1,7 +1,10 @@
 import datetime
+import os
 from unittest import mock
 from unittest import TestCase

+from celery.schedules import crontab
+from paperless.settings import _parse_beat_schedule
 from paperless.settings import _parse_ignore_dates
 from paperless.settings import _parse_redis_url
 from paperless.settings import default_threads_per_worker
@@ -60,6 +63,8 @@ class TestIgnoreDateParsing(TestCase):

         self._parse_checker(test_cases)

+
+class TestThreadCalculation(TestCase):
     def test_workers_threads(self):
         """
         GIVEN:
@@ -84,6 +89,8 @@ class TestIgnoreDateParsing(TestCase):

         self.assertLessEqual(default_workers * default_threads, i)

+
+class TestRedisSocketConversion(TestCase):
     def test_redis_socket_parsing(self):
         """
         GIVEN:
@@ -139,3 +146,132 @@ class TestIgnoreDateParsing(TestCase):
         ]:
             result = _parse_redis_url(input)
             self.assertTupleEqual(expected, result)
+
+
+class TestCeleryScheduleParsing(TestCase):
+    def test_schedule_configuration_default(self):
+        """
+        GIVEN:
+            - No configured task schedules
+        WHEN:
+            - The celery beat schedule is built
+        THEN:
+            - The default schedule is returned
+        """
+        schedule = _parse_beat_schedule()
+
+        self.assertDictEqual(
+            {
+                "Check all e-mail accounts": {
+                    "task": "paperless_mail.tasks.process_mail_accounts",
+                    "schedule": crontab(minute="*/10"),
+                },
+                "Train the classifier": {
+                    "task": "documents.tasks.train_classifier",
+                    "schedule": crontab(minute="5", hour="*/1"),
+                },
+                "Optimize the index": {
+                    "task": "documents.tasks.index_optimize",
+                    "schedule": crontab(minute=0, hour=0),
+                },
+                "Perform sanity check": {
+                    "task": "documents.tasks.sanity_check",
+                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                },
+            },
+            schedule,
+        )
+
+    def test_schedule_configuration_changed(self):
+        """
+        GIVEN:
+            - Email task is configured non-default
+        WHEN:
+            - The celery beat schedule is built
+        THEN:
+            - The email task is configured per environment
+            - The default schedule is returned for other tasks
+        """
+        with mock.patch.dict(
+            os.environ,
+            {"PAPERLESS_EMAIL_TASK_CRON": "*/50 * * * mon"},
+        ):
+            schedule = _parse_beat_schedule()
+
+        self.assertDictEqual(
+            {
+                "Check all e-mail accounts": {
+                    "task": "paperless_mail.tasks.process_mail_accounts",
+                    "schedule": crontab(minute="*/50", day_of_week="mon"),
+                },
+                "Train the classifier": {
+                    "task": "documents.tasks.train_classifier",
+                    "schedule": crontab(minute="5", hour="*/1"),
+                },
+                "Optimize the index": {
+                    "task": "documents.tasks.index_optimize",
+                    "schedule": crontab(minute=0, hour=0),
+                },
+                "Perform sanity check": {
+                    "task": "documents.tasks.sanity_check",
+                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                },
+            },
+            schedule,
+        )
+
+    def test_schedule_configuration_disabled(self):
+        """
+        GIVEN:
+            - Search index task is disabled
+        WHEN:
+            - The celery beat schedule is built
+        THEN:
+            - The search index task is not present
+            - The default schedule is returned for other tasks
+        """
+        with mock.patch.dict(os.environ, {"PAPERLESS_INDEX_TASK_CRON": "disable"}):
+            schedule = _parse_beat_schedule()
+
+        self.assertDictEqual(
+            {
+                "Check all e-mail accounts": {
+                    "task": "paperless_mail.tasks.process_mail_accounts",
+                    "schedule": crontab(minute="*/10"),
+                },
+                "Train the classifier": {
+                    "task": "documents.tasks.train_classifier",
+                    "schedule": crontab(minute="5", hour="*/1"),
+                },
+                "Perform sanity check": {
+                    "task": "documents.tasks.sanity_check",
+                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                },
+            },
+            schedule,
+        )
+
+    def test_schedule_configuration_disabled_all(self):
+        """
+        GIVEN:
+            - All tasks are disabled
+        WHEN:
+            - The celery beat schedule is built
+        THEN:
+            - No tasks are scheduled
+        """
+        with mock.patch.dict(
+            os.environ,
+            {
+                "PAPERLESS_EMAIL_TASK_CRON": "disable",
+                "PAPERLESS_TRAIN_TASK_CRON": "disable",
+                "PAPERLESS_SANITY_TASK_CRON": "disable",
+                "PAPERLESS_INDEX_TASK_CRON": "disable",
+            },
+        ):
+            schedule = _parse_beat_schedule()
+
+        self.assertDictEqual(
+            {},
+            schedule,
+        )
@@ -158,7 +158,7 @@ urlpatterns = [
 ]

 websocket_urlpatterns = [
-    re_path(r"ws/status/$", StatusConsumer.as_asgi()),
+    path(settings.BASE_URL.lstrip("/") + "ws/status/", StatusConsumer.as_asgi()),
 ]

 # Text in each page's <h1> (and above login form).
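Effect of the route change: the status websocket now respects a sub-path installation. With the default root-level BASE_URL the route is unchanged; with a sub-path (the value below is only an example), it moves under that prefix:

# example: paperless served under a sub-path
base_url = "/paperless/"
route = base_url.lstrip("/") + "ws/status/"
assert route == "paperless/ws/status/"

# example: default root install keeps the old route
assert "/".lstrip("/") + "ws/status/" == "ws/status/"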
@@ -1,3 +1,4 @@
+import itertools
 import os
 import re
 import tempfile
@@ -25,6 +26,28 @@ from imap_tools.mailbox import MailBoxTls
 from paperless_mail.models import MailAccount
 from paperless_mail.models import MailRule

+# Apple Mail sets multiple IMAP KEYWORD and the general "\Flagged" FLAG
+# imaplib => conn.fetch(b"<message_id>", "FLAGS")
+
+# no flag - (FLAGS (\\Seen $NotJunk NotJunk))'
+# red - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk))'
+# orange - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit0))'
+# yellow - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit1))'
+# blue - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit2))'
+# green - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit0 $MailFlagBit1))'
+# violet - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit0 $MailFlagBit2))'
+# grey - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit1 $MailFlagBit2))'
+
+APPLE_MAIL_TAG_COLORS = {
+    "red": [],
+    "orange": ["$MailFlagBit0"],
+    "yellow": ["$MailFlagBit1"],
+    "blue": ["$MailFlagBit2"],
+    "green": ["$MailFlagBit0", "$MailFlagBit1"],
+    "violet": ["$MailFlagBit0", "$MailFlagBit2"],
+    "grey": ["$MailFlagBit1", "$MailFlagBit2"],
+}
+
+
 class MailError(Exception):
     pass
@@ -66,18 +89,59 @@ class FlagMailAction(BaseMailAction):

 class TagMailAction(BaseMailAction):
     def __init__(self, parameter):
-        self.keyword = parameter
+
+        # The custom tag should look like "apple:<color>"
+        if "apple:" in parameter.lower():
+
+            _, self.color = parameter.split(":")
+            self.color = self.color.strip()
+
+            if not self.color.lower() in APPLE_MAIL_TAG_COLORS.keys():
+                raise MailError("Not a valid AppleMail tag color.")
+
+            self.keyword = None
+
+        else:
+            self.keyword = parameter
+            self.color = None

     def get_criteria(self):
+
+        # AppleMail: We only need to check if mails are \Flagged
+        if self.color:
+            return {"flagged": False}
+
         return {"no_keyword": self.keyword, "gmail_label": self.keyword}

     def post_consume(self, M: MailBox, message_uids, parameter):
         if re.search(r"gmail\.com$|googlemail\.com$", M._host):
             for uid in message_uids:
                 M.client.uid("STORE", uid, "X-GM-LABELS", self.keyword)
-        else:
+
+        # AppleMail
+        elif self.color:
+
+            # Remove all existing $MailFlagBits
+            M.flag(
+                message_uids,
+                set(itertools.chain(*APPLE_MAIL_TAG_COLORS.values())),
+                False,
+            )
+
+            # Set new $MailFlagBits
+            M.flag(message_uids, APPLE_MAIL_TAG_COLORS.get(self.color), True)
+
+            # Set the general \Flagged
+            # This defaults to the "red" flag in AppleMail and
+            # "stars" in Thunderbird or GMail
+            M.flag(message_uids, [MailMessageFlags.FLAGGED], True)
+
+        elif self.keyword:
             M.flag(message_uids, [self.keyword], True)
+
+        else:
+            raise MailError("No keyword specified.")


 def get_rule_action(rule) -> BaseMailAction:
     if rule.action == MailRule.MailAction.FLAG:
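The encoding above is additive: Apple Mail represents its seven colors as subsets of three $MailFlagBit keywords plus the standard \Flagged flag. Re-tagging therefore means clearing all three bits, then setting only the subset for the requested color; a standalone sketch of that bookkeeping (table truncated for brevity):

import itertools

APPLE_MAIL_TAG_COLORS = {
    "red": [],
    "orange": ["$MailFlagBit0"],
    "yellow": ["$MailFlagBit1"],
    "blue": ["$MailFlagBit2"],
    "green": ["$MailFlagBit0", "$MailFlagBit1"],
}

color = "green"
# 1. keywords to clear: every bit used by any color
bits_to_clear = set(itertools.chain(*APPLE_MAIL_TAG_COLORS.values()))
# 2. keywords to set for this color
bits_to_set = APPLE_MAIL_TAG_COLORS[color]
print(bits_to_clear, bits_to_set)  # bit0 + bit1 encodes "green"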
@@ -197,14 +261,14 @@ class MailAccountHandler(LoggingMixin):

             try:
                 M.login_utf8(account.username, account.password)
-            except Exception as err:
+            except Exception as e:
                 self.log(
                     "error",
                     "Unable to authenticate with mail server using AUTH=PLAIN",
                 )
                 raise MailError(
                     f"Error while authenticating account {account}",
-                ) from err
+                ) from e
         except Exception as e:
             self.log(
                 "error",
@@ -24,6 +24,7 @@ from imap_tools import NOT
 from paperless_mail import tasks
 from paperless_mail.mail import MailAccountHandler
 from paperless_mail.mail import MailError
+from paperless_mail.mail import TagMailAction
 from paperless_mail.models import MailAccount
 from paperless_mail.models import MailRule
@@ -674,6 +675,39 @@ class TestMail(DirectoriesMixin, TestCase):
         self.assertEqual(len(self.bogus_mailbox.fetch(criteria, False)), 0)
         self.assertEqual(len(self.bogus_mailbox.messages), 3)

+    def test_tag_mail_action_applemail_wrong_input(self):
+
+        self.assertRaises(
+            MailError,
+            TagMailAction,
+            "apple:black",
+        )
+
+    def test_handle_mail_account_tag_applemail(self):
+        # all mails will be FLAGGED afterwards
+
+        account = MailAccount.objects.create(
+            name="test",
+            imap_server="",
+            username="admin",
+            password="secret",
+        )
+
+        _ = MailRule.objects.create(
+            name="testrule",
+            account=account,
+            action=MailRule.MailAction.TAG,
+            action_parameter="apple:green",
+        )
+
+        self.assertEqual(len(self.bogus_mailbox.messages), 3)
+        self.assertEqual(self.async_task.call_count, 0)
+        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)
+        self.mail_account_handler.handle_mail_account(account)
+        self.assertEqual(self.async_task.call_count, 2)
+        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 0)
+        self.assertEqual(len(self.bogus_mailbox.messages), 3)
+
     def test_error_login(self):
         account = MailAccount.objects.create(
             name="test",
@@ -2,6 +2,7 @@ import json
 import os
 import re
+import subprocess
 import tempfile
 from pathlib import Path
 from typing import Optional
@@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
         if not os.path.isfile(pdf_file):
             return None

-        from pdfminer.high_level import extract_text as pdfminer_extract_text
-
-        try:
-            stripped = post_process_text(pdfminer_extract_text(pdf_file))
-
-            # pdfminer.six does not handle RTL text
-            # as a hack, for some languages, return no text, to force
-            # OCRMyPdf/Tesseract do handle this correctly
-            from langdetect import detect
-
-            lang = detect(stripped)
-
-            self.log("debug", f"Detected language {lang}")
-
-            if (
-                lang
-                in {
-                    "ar",  # Arabic
-                    "he",  # Hebrew,
-                    "fa",  # Persian
-                }
-                and pdf_file.name != "archive-fallback.pdf"
-            ):
-                raise RtlLanguageException()
-            return stripped
-        except RtlLanguageException:
-            self.log("warning", f"Detected RTL language {lang}")
-            return None
-        except Exception:
-            # TODO catch all for various issues with PDFminer.six.
-            # If PDFminer fails, fall back to OCR.
+        text = None
+        with tempfile.NamedTemporaryFile(
+            mode="w+",
+            dir=self.tempdir,
+        ) as tmp:
+            subprocess.run(
+                [
+                    "pdftotext",
+                    "-q",
+                    "-layout",
+                    "-enc",
+                    "UTF-8",
+                    pdf_file,
+                    tmp.name,
+                ],
+            )
+            text = tmp.read()
+
+        self.log("debug", f"Extracted text from PDF file {pdf_file}")
+        return post_process_text(text)
@@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
             )
             if original_has_text:
                 self.text = text_original
-        except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
+        except (NoTextFoundException, InputFileError) as e:
             self.log(
                 "warning",
                 f"Encountered an error while running OCR: {str(e)}. "
@@ -670,28 +670,14 @@ class TestParser(DirectoriesMixin, TestCase):
             - Text from the document is extracted
         """
         parser = RasterisedDocumentParser(None)
-        with mock.patch.object(
-            parser,
-            "construct_ocrmypdf_parameters",
-            wraps=parser.construct_ocrmypdf_parameters,
-        ) as wrapped:
-
-            parser.parse(
-                os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
-                "application/pdf",
-            )
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
+            "application/pdf",
+        )

-            # There isn't a good way to actually check this working, with RTL correctly return
-            # as it would require tesseract-ocr-ara installed for everyone running the
-            # test suite. This test does provide the coverage though and attempts to ensure
-            # the force OCR happens
-            self.assertIsNotNone(parser.get_text())
-
-            self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
-            # Check the last call kwargs
-            self.assertTrue(
-                parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
-            )
+        # Copied from the PDF to here. Don't even look at it
+        self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())


 class TestParserFileTypes(DirectoriesMixin, TestCase):