Merge branch 'dev' into feature-permissions

This commit is contained in:
shamoon
2023-01-16 15:59:25 -08:00
committed by GitHub
29 changed files with 923 additions and 482 deletions

View File

@@ -19,7 +19,7 @@ from watchdog.observers.polling import PollingObserver
try:
from inotifyrecursive import INotify, flags
except ImportError:
except ImportError: # pragma: nocover
INotify = flags = None
logger = logging.getLogger("paperless.management.consumer")

View File

@@ -4,6 +4,9 @@ import os
import shutil
import tempfile
import time
from pathlib import Path
from typing import List
from typing import Set
import tqdm
from django.conf import settings
@@ -96,16 +99,16 @@ class Command(BaseCommand):
def __init__(self, *args, **kwargs):
BaseCommand.__init__(self, *args, **kwargs)
self.target = None
self.files_in_export_dir = []
self.exported_files = []
self.target: Path = None
self.files_in_export_dir: Set[Path] = set()
self.exported_files: List[Path] = []
self.compare_checksums = False
self.use_filename_format = False
self.delete = False
def handle(self, *args, **options):
self.target = options["target"]
self.target = Path(options["target"]).resolve()
self.compare_checksums = options["compare_checksums"]
self.use_filename_format = options["use_filename_format"]
self.delete = options["delete"]
@@ -121,11 +124,14 @@ class Command(BaseCommand):
dir=settings.SCRATCH_DIR,
prefix="paperless-export",
)
self.target = temp_dir.name
self.target = Path(temp_dir.name).resolve()
if not os.path.exists(self.target):
if not self.target.exists():
raise CommandError("That path doesn't exist")
if not self.target.is_dir():
raise CommandError("That path isn't a directory")
if not os.access(self.target, os.W_OK):
raise CommandError("That path doesn't appear to be writable")
@@ -152,10 +158,9 @@ class Command(BaseCommand):
def dump(self, progress_bar_disable=False):
# 1. Take a snapshot of what files exist in the current export folder
for root, dirs, files in os.walk(self.target):
self.files_in_export_dir.extend(
map(lambda f: os.path.abspath(os.path.join(root, f)), files),
)
for x in self.target.glob("**/*"):
if x.is_file():
self.files_in_export_dir.add(x.resolve())
# 2. Create manifest, containing all correspondents, types, tags, storage paths
# comments, documents and ui_settings
@@ -238,16 +243,16 @@ class Command(BaseCommand):
# 3.3. write filenames into manifest
original_name = base_name
original_target = os.path.join(self.target, original_name)
original_target = (self.target / Path(original_name)).resolve()
document_dict[EXPORTER_FILE_NAME] = original_name
thumbnail_name = base_name + "-thumbnail.webp"
thumbnail_target = os.path.join(self.target, thumbnail_name)
thumbnail_target = (self.target / Path(thumbnail_name)).resolve()
document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name
if document.has_archive_version:
archive_name = base_name + "-archive.pdf"
archive_target = os.path.join(self.target, archive_name)
archive_target = (self.target / Path(archive_name)).resolve()
document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
else:
archive_target = None
@@ -256,24 +261,21 @@ class Command(BaseCommand):
t = int(time.mktime(document.created.timetuple()))
if document.storage_type == Document.STORAGE_TYPE_GPG:
os.makedirs(os.path.dirname(original_target), exist_ok=True)
with open(original_target, "wb") as f:
with document.source_file as out_file:
f.write(GnuPG.decrypted(out_file))
os.utime(original_target, times=(t, t))
original_target.parent.mkdir(parents=True, exist_ok=True)
with document.source_file as out_file:
original_target.write_bytes(GnuPG.decrypted(out_file))
os.utime(original_target, times=(t, t))
os.makedirs(os.path.dirname(thumbnail_target), exist_ok=True)
with open(thumbnail_target, "wb") as f:
with document.thumbnail_file as out_file:
f.write(GnuPG.decrypted(out_file))
os.utime(thumbnail_target, times=(t, t))
thumbnail_target.parent.mkdir(parents=True, exist_ok=True)
with document.thumbnail_file as out_file:
thumbnail_target.write_bytes(GnuPG.decrypted(out_file))
os.utime(thumbnail_target, times=(t, t))
if archive_target:
os.makedirs(os.path.dirname(archive_target), exist_ok=True)
with open(archive_target, "wb") as f:
with document.archive_path as out_file:
f.write(GnuPG.decrypted(out_file))
os.utime(archive_target, times=(t, t))
archive_target.parent.mkdir(parents=True, exist_ok=True)
with document.archive_path as out_file:
archive_target.write_bytes(GnuPG.decrypted(out_file))
os.utime(archive_target, times=(t, t))
else:
self.check_and_copy(
document.source_path,
@@ -291,16 +293,14 @@ class Command(BaseCommand):
)
# 4.1 write manifest to target folder
manifest_path = os.path.abspath(os.path.join(self.target, "manifest.json"))
with open(manifest_path, "w") as f:
json.dump(manifest, f, indent=2)
manifest_path = (self.target / Path("manifest.json")).resolve()
manifest_path.write_text(json.dumps(manifest, indent=2))
# 4.2 write version information to target folder
version_path = os.path.abspath(os.path.join(self.target, "version.json"))
with open(version_path, "w") as f:
json.dump({"version": version.__full_version_str__}, f, indent=2)
version_path = (self.target / Path("version.json")).resolve()
version_path.write_text(
json.dumps({"version": version.__full_version_str__}, indent=2),
)
if self.delete:
# 5. Remove files which we did not explicitly export in this run
@@ -309,25 +309,24 @@ class Command(BaseCommand):
self.files_in_export_dir.remove(manifest_path)
for f in self.files_in_export_dir:
os.remove(f)
f.unlink()
delete_empty_directories(
os.path.abspath(os.path.dirname(f)),
os.path.abspath(self.target),
f.parent,
self.target,
)
def check_and_copy(self, source, source_checksum, target):
if os.path.abspath(target) in self.files_in_export_dir:
self.files_in_export_dir.remove(os.path.abspath(target))
def check_and_copy(self, source, source_checksum, target: Path):
if target in self.files_in_export_dir:
self.files_in_export_dir.remove(target)
perform_copy = False
if os.path.exists(target):
if target.exists():
source_stat = os.stat(source)
target_stat = os.stat(target)
target_stat = target.stat()
if self.compare_checksums and source_checksum:
with open(target, "rb") as f:
target_checksum = hashlib.md5(f.read()).hexdigest()
target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
perform_copy = target_checksum != source_checksum
elif source_stat.st_mtime != target_stat.st_mtime:
perform_copy = True
@@ -338,5 +337,5 @@ class Command(BaseCommand):
perform_copy = True
if perform_copy:
os.makedirs(os.path.dirname(target), exist_ok=True)
target.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(source, target)

View File

@@ -447,7 +447,7 @@ def update_filename_and_move_files(sender, instance, **kwargs):
)
except (OSError, DatabaseError, CannotMoveFilesException) as e:
logger.warn(f"Exception during file handling: {e}")
logger.warning(f"Exception during file handling: {e}")
# This happens when either:
# - moving the files failed due to file system errors
# - saving to the database failed due to database errors

View File

@@ -3,10 +3,10 @@ import logging
import os
import shutil
import uuid
from datetime import datetime
from pathlib import Path
from typing import Type
import dateutil.parser
import tqdm
from asgiref.sync import async_to_sync
from celery import shared_task
@@ -107,7 +107,7 @@ def consume_file(
# More types will be retained through JSON encode/decode
if override_created is not None and isinstance(override_created, str):
try:
override_created = datetime.fromisoformat(override_created)
override_created = dateutil.parser.isoparse(override_created)
except Exception:
pass

View File

@@ -7,6 +7,7 @@ import tempfile
import urllib.request
import uuid
import zipfile
from pathlib import Path
from unittest import mock
from unittest.mock import MagicMock
@@ -812,7 +813,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
m.assert_called_once()
args, kwargs = m.call_args
self.assertEqual(kwargs["override_filename"], "simple.pdf")
file_path = Path(args[0])
self.assertEqual(file_path.name, "simple.pdf")
self.assertIn(Path(settings.SCRATCH_DIR), file_path.parents)
self.assertIsNone(kwargs["override_title"])
self.assertIsNone(kwargs["override_correspondent_id"])
self.assertIsNone(kwargs["override_document_type_id"])
@@ -837,7 +840,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
m.assert_called_once()
args, kwargs = m.call_args
self.assertEqual(kwargs["override_filename"], "simple.pdf")
file_path = Path(args[0])
self.assertEqual(file_path.name, "simple.pdf")
self.assertIn(Path(settings.SCRATCH_DIR), file_path.parents)
self.assertIsNone(kwargs["override_title"])
self.assertIsNone(kwargs["override_correspondent_id"])
self.assertIsNone(kwargs["override_document_type_id"])

View File

@@ -8,6 +8,7 @@ from unittest import mock
from zipfile import ZipFile
from django.core.management import call_command
from django.core.management.base import CommandError
from django.test import override_settings
from django.test import TestCase
from django.utils import timezone
@@ -438,3 +439,61 @@ class TestExportImport(DirectoriesMixin, TestCase):
self.assertEqual(len(zip.namelist()), 14)
self.assertIn("manifest.json", zip.namelist())
self.assertIn("version.json", zip.namelist())
def test_export_target_not_exists(self):
"""
GIVEN:
- Request to export documents to directory that doesn't exist
WHEN:
- Export command is called
THEN:
- Error is raised
"""
args = ["document_exporter", "/tmp/foo/bar"]
with self.assertRaises(CommandError) as e:
call_command(*args)
self.assertEqual("That path isn't a directory", str(e))
def test_export_target_exists_but_is_file(self):
"""
GIVEN:
- Request to export documents to file instead of directory
WHEN:
- Export command is called
THEN:
- Error is raised
"""
with tempfile.NamedTemporaryFile() as tmp_file:
args = ["document_exporter", tmp_file.name]
with self.assertRaises(CommandError) as e:
call_command(*args)
self.assertEqual("That path isn't a directory", str(e))
def test_export_target_not_writable(self):
"""
GIVEN:
- Request to export documents to directory that's not writeable
WHEN:
- Export command is called
THEN:
- Error is raised
"""
with tempfile.TemporaryDirectory() as tmp_dir:
os.chmod(tmp_dir, 0o000)
args = ["document_exporter", tmp_dir]
with self.assertRaises(CommandError) as e:
call_command(*args)
self.assertEqual("That path doesn't appear to be writable", str(e))

View File

@@ -8,10 +8,12 @@ import urllib
import uuid
import zipfile
from datetime import datetime
from pathlib import Path
from time import mktime
from unicodedata import normalize
from urllib.parse import quote
import pathvalidate
from django.conf import settings
from django.contrib.auth.models import User
from django.db.models import Case
@@ -33,6 +35,7 @@ from documents.filters import ObjectOwnedOrGrandtedPermissionsFilter
from documents.permissions import PaperlessAdminPermissions
from documents.permissions import PaperlessObjectPermissions
from documents.tasks import consume_file
from langdetect import detect
from packaging import version as packaging_version
from paperless import version
from paperless.db import GnuPG
@@ -361,6 +364,13 @@ class DocumentViewSet(
"original_filename": doc.original_filename,
}
lang = "en"
try:
lang = detect(doc.content)
except Exception:
pass
meta["lang"] = lang
if doc.has_archive_version:
meta["archive_size"] = self.get_filesize(doc.archive_path)
meta["archive_metadata"] = self.get_metadata(
@@ -658,20 +668,19 @@ class PostDocumentView(GenericAPIView):
os.makedirs(settings.SCRATCH_DIR, exist_ok=True)
with tempfile.NamedTemporaryFile(
prefix="paperless-upload-",
dir=settings.SCRATCH_DIR,
delete=False,
) as f:
f.write(doc_data)
os.utime(f.name, times=(t, t))
temp_filename = f.name
temp_file_path = Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR)) / Path(
pathvalidate.sanitize_filename(doc_name),
)
temp_file_path.write_bytes(doc_data)
os.utime(temp_file_path, times=(t, t))
task_id = str(uuid.uuid4())
async_task = consume_file.delay(
temp_filename,
override_filename=doc_name,
# Paths are not JSON friendly
str(temp_file_path),
override_title=title,
override_correspondent_id=correspondent_id,
override_document_type_id=document_type_id,

View File

@@ -5,6 +5,7 @@ import multiprocessing
import os
import re
import tempfile
from typing import Dict
from typing import Final
from typing import Optional
from typing import Set
@@ -107,6 +108,57 @@ def _parse_redis_url(env_redis: Optional[str]) -> Tuple[str]:
return (env_redis, env_redis)
def _parse_beat_schedule() -> Dict:
schedule = {}
tasks = [
{
"name": "Check all e-mail accounts",
"env_key": "PAPERLESS_EMAIL_TASK_CRON",
# Default every ten minutes
"env_default": "*/10 * * * *",
"task": "paperless_mail.tasks.process_mail_accounts",
},
{
"name": "Train the classifier",
"env_key": "PAPERLESS_TRAIN_TASK_CRON",
# Default hourly at 5 minutes past the hour
"env_default": "5 */1 * * *",
"task": "documents.tasks.train_classifier",
},
{
"name": "Optimize the index",
"env_key": "PAPERLESS_INDEX_TASK_CRON",
# Default daily at midnight
"env_default": "0 0 * * *",
"task": "documents.tasks.index_optimize",
},
{
"name": "Perform sanity check",
"env_key": "PAPERLESS_SANITY_TASK_CRON",
# Default Sunday at 00:30
"env_default": "30 0 * * sun",
"task": "documents.tasks.sanity_check",
},
]
for task in tasks:
# Either get the environment setting or use the default
value = os.getenv(task["env_key"], task["env_default"])
# Don't add disabled tasks to the schedule
if value == "disable":
continue
# I find https://crontab.guru/ super helpful
# crontab(5) format
# - five time-and-date fields
# - separated by at least one blank
minute, hour, day_month, month, day_week = value.split(" ")
schedule[task["name"]] = {
"task": task["task"],
"schedule": crontab(minute, hour, day_week, day_month, month),
}
return schedule
# NEVER RUN WITH DEBUG IN PRODUCTION.
DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")
@@ -126,7 +178,7 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")
DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))
NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/local/share/nltk_data")
NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/share/nltk_data")
TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")
@@ -533,29 +585,10 @@ CELERY_RESULT_EXTENDED = True
CELERY_RESULT_BACKEND = "django-db"
CELERY_CACHE_BACKEND = "default"
# https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-schedule
CELERY_BEAT_SCHEDULE = _parse_beat_schedule()
CELERY_BEAT_SCHEDULE = {
# Every ten minutes
"Check all e-mail accounts": {
"task": "paperless_mail.tasks.process_mail_accounts",
"schedule": crontab(minute="*/10"),
},
# Hourly at 5 minutes past the hour
"Train the classifier": {
"task": "documents.tasks.train_classifier",
"schedule": crontab(minute="5", hour="*/1"),
},
# Daily at midnight
"Optimize the index": {
"task": "documents.tasks.index_optimize",
"schedule": crontab(minute=0, hour=0),
},
# Weekly, Sunday at 00:30
"Perform sanity check": {
"task": "documents.tasks.sanity_check",
"schedule": crontab(minute=30, hour=0, day_of_week="sun"),
},
}
# https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-schedule-filename
CELERY_BEAT_SCHEDULE_FILENAME = os.path.join(DATA_DIR, "celerybeat-schedule.db")
# django setting.

View File

@@ -1,7 +1,10 @@
import datetime
import os
from unittest import mock
from unittest import TestCase
from celery.schedules import crontab
from paperless.settings import _parse_beat_schedule
from paperless.settings import _parse_ignore_dates
from paperless.settings import _parse_redis_url
from paperless.settings import default_threads_per_worker
@@ -60,6 +63,8 @@ class TestIgnoreDateParsing(TestCase):
self._parse_checker(test_cases)
class TestThreadCalculation(TestCase):
def test_workers_threads(self):
"""
GIVEN:
@@ -84,6 +89,8 @@ class TestIgnoreDateParsing(TestCase):
self.assertLessEqual(default_workers * default_threads, i)
class TestRedisSocketConversion(TestCase):
def test_redis_socket_parsing(self):
"""
GIVEN:
@@ -139,3 +146,132 @@ class TestIgnoreDateParsing(TestCase):
]:
result = _parse_redis_url(input)
self.assertTupleEqual(expected, result)
class TestCeleryScheduleParsing(TestCase):
def test_schedule_configuration_default(self):
"""
GIVEN:
- No configured task schedules
WHEN:
- The celery beat schedule is built
THEN:
- The default schedule is returned
"""
schedule = _parse_beat_schedule()
self.assertDictEqual(
{
"Check all e-mail accounts": {
"task": "paperless_mail.tasks.process_mail_accounts",
"schedule": crontab(minute="*/10"),
},
"Train the classifier": {
"task": "documents.tasks.train_classifier",
"schedule": crontab(minute="5", hour="*/1"),
},
"Optimize the index": {
"task": "documents.tasks.index_optimize",
"schedule": crontab(minute=0, hour=0),
},
"Perform sanity check": {
"task": "documents.tasks.sanity_check",
"schedule": crontab(minute=30, hour=0, day_of_week="sun"),
},
},
schedule,
)
def test_schedule_configuration_changed(self):
"""
GIVEN:
- Email task is configured non-default
WHEN:
- The celery beat schedule is built
THEN:
- The email task is configured per environment
- The default schedule is returned for other tasks
"""
with mock.patch.dict(
os.environ,
{"PAPERLESS_EMAIL_TASK_CRON": "*/50 * * * mon"},
):
schedule = _parse_beat_schedule()
self.assertDictEqual(
{
"Check all e-mail accounts": {
"task": "paperless_mail.tasks.process_mail_accounts",
"schedule": crontab(minute="*/50", day_of_week="mon"),
},
"Train the classifier": {
"task": "documents.tasks.train_classifier",
"schedule": crontab(minute="5", hour="*/1"),
},
"Optimize the index": {
"task": "documents.tasks.index_optimize",
"schedule": crontab(minute=0, hour=0),
},
"Perform sanity check": {
"task": "documents.tasks.sanity_check",
"schedule": crontab(minute=30, hour=0, day_of_week="sun"),
},
},
schedule,
)
def test_schedule_configuration_disabled(self):
"""
GIVEN:
- Search index task is disabled
WHEN:
- The celery beat schedule is built
THEN:
- The search index task is not present
- The default schedule is returned for other tasks
"""
with mock.patch.dict(os.environ, {"PAPERLESS_INDEX_TASK_CRON": "disable"}):
schedule = _parse_beat_schedule()
self.assertDictEqual(
{
"Check all e-mail accounts": {
"task": "paperless_mail.tasks.process_mail_accounts",
"schedule": crontab(minute="*/10"),
},
"Train the classifier": {
"task": "documents.tasks.train_classifier",
"schedule": crontab(minute="5", hour="*/1"),
},
"Perform sanity check": {
"task": "documents.tasks.sanity_check",
"schedule": crontab(minute=30, hour=0, day_of_week="sun"),
},
},
schedule,
)
def test_schedule_configuration_disabled_all(self):
"""
GIVEN:
- All tasks are disabled
WHEN:
- The celery beat schedule is built
THEN:
- No tasks are scheduled
"""
with mock.patch.dict(
os.environ,
{
"PAPERLESS_EMAIL_TASK_CRON": "disable",
"PAPERLESS_TRAIN_TASK_CRON": "disable",
"PAPERLESS_SANITY_TASK_CRON": "disable",
"PAPERLESS_INDEX_TASK_CRON": "disable",
},
):
schedule = _parse_beat_schedule()
self.assertDictEqual(
{},
schedule,
)

View File

@@ -158,7 +158,7 @@ urlpatterns = [
websocket_urlpatterns = [
re_path(r"ws/status/$", StatusConsumer.as_asgi()),
path(settings.BASE_URL.lstrip("/") + "ws/status/", StatusConsumer.as_asgi()),
]
# Text in each page's <h1> (and above login form).

View File

@@ -1,3 +1,4 @@
import itertools
import os
import re
import tempfile
@@ -25,6 +26,28 @@ from imap_tools.mailbox import MailBoxTls
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
# Apple Mail sets multiple IMAP KEYWORD and the general "\Flagged" FLAG
# imaplib => conn.fetch(b"<message_id>", "FLAGS")
# no flag - (FLAGS (\\Seen $NotJunk NotJunk))'
# red - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk))'
# orange - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit0))'
# yellow - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit1))'
# blue - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit2))'
# green - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit0 $MailFlagBit1))'
# violet - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit0 $MailFlagBit2))'
# grey - (FLAGS (\\Flagged \\Seen $NotJunk NotJunk $MailFlagBit1 $MailFlagBit2))'
APPLE_MAIL_TAG_COLORS = {
"red": [],
"orange": ["$MailFlagBit0"],
"yellow": ["$MailFlagBit1"],
"blue": ["$MailFlagBit2"],
"green": ["$MailFlagBit0", "$MailFlagBit1"],
"violet": ["$MailFlagBit0", "$MailFlagBit2"],
"grey": ["$MailFlagBit1", "$MailFlagBit2"],
}
class MailError(Exception):
pass
@@ -66,18 +89,59 @@ class FlagMailAction(BaseMailAction):
class TagMailAction(BaseMailAction):
def __init__(self, parameter):
self.keyword = parameter
# The custom tag should look like "apple:<color>"
if "apple:" in parameter.lower():
_, self.color = parameter.split(":")
self.color = self.color.strip()
if not self.color.lower() in APPLE_MAIL_TAG_COLORS.keys():
raise MailError("Not a valid AppleMail tag color.")
self.keyword = None
else:
self.keyword = parameter
self.color = None
def get_criteria(self):
# AppleMail: We only need to check if mails are \Flagged
if self.color:
return {"flagged": False}
return {"no_keyword": self.keyword, "gmail_label": self.keyword}
def post_consume(self, M: MailBox, message_uids, parameter):
if re.search(r"gmail\.com$|googlemail\.com$", M._host):
for uid in message_uids:
M.client.uid("STORE", uid, "X-GM-LABELS", self.keyword)
else:
# AppleMail
elif self.color:
# Remove all existing $MailFlagBits
M.flag(
message_uids,
set(itertools.chain(*APPLE_MAIL_TAG_COLORS.values())),
False,
)
# Set new $MailFlagBits
M.flag(message_uids, APPLE_MAIL_TAG_COLORS.get(self.color), True)
# Set the general \Flagged
# This defaults to the "red" flag in AppleMail and
# "stars" in Thunderbird or GMail
M.flag(message_uids, [MailMessageFlags.FLAGGED], True)
elif self.keyword:
M.flag(message_uids, [self.keyword], True)
else:
raise MailError("No keyword specified.")
def get_rule_action(rule) -> BaseMailAction:
if rule.action == MailRule.MailAction.FLAG:
@@ -197,14 +261,14 @@ class MailAccountHandler(LoggingMixin):
try:
M.login_utf8(account.username, account.password)
except Exception as err:
except Exception as e:
self.log(
"error",
"Unable to authenticate with mail server using AUTH=PLAIN",
)
raise MailError(
f"Error while authenticating account {account}",
) from err
) from e
except Exception as e:
self.log(
"error",

View File

@@ -24,6 +24,7 @@ from imap_tools import NOT
from paperless_mail import tasks
from paperless_mail.mail import MailAccountHandler
from paperless_mail.mail import MailError
from paperless_mail.mail import TagMailAction
from paperless_mail.models import MailAccount
from paperless_mail.models import MailRule
@@ -674,6 +675,39 @@ class TestMail(DirectoriesMixin, TestCase):
self.assertEqual(len(self.bogus_mailbox.fetch(criteria, False)), 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
def test_tag_mail_action_applemail_wrong_input(self):
self.assertRaises(
MailError,
TagMailAction,
"apple:black",
)
def test_handle_mail_account_tag_applemail(self):
# all mails will be FLAGGED afterwards
account = MailAccount.objects.create(
name="test",
imap_server="",
username="admin",
password="secret",
)
_ = MailRule.objects.create(
name="testrule",
account=account,
action=MailRule.MailAction.TAG,
action_parameter="apple:green",
)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
self.assertEqual(self.async_task.call_count, 0)
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)
self.mail_account_handler.handle_mail_account(account)
self.assertEqual(self.async_task.call_count, 2)
self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 0)
self.assertEqual(len(self.bogus_mailbox.messages), 3)
def test_error_login(self):
account = MailAccount.objects.create(
name="test",

View File

@@ -2,6 +2,7 @@ import json
import os
import re
import subprocess
import tempfile
from pathlib import Path
from typing import Optional
@@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
if not os.path.isfile(pdf_file):
return None
from pdfminer.high_level import extract_text as pdfminer_extract_text
try:
stripped = post_process_text(pdfminer_extract_text(pdf_file))
text = None
with tempfile.NamedTemporaryFile(
mode="w+",
dir=self.tempdir,
) as tmp:
subprocess.run(
[
"pdftotext",
"-q",
"-layout",
"-enc",
"UTF-8",
pdf_file,
tmp.name,
],
)
text = tmp.read()
self.log("debug", f"Extracted text from PDF file {pdf_file}")
return post_process_text(text)
# pdfminer.six does not handle RTL text
# as a hack, for some languages, return no text, to force
# OCRMyPdf/Tesseract do handle this correctly
from langdetect import detect
lang = detect(stripped)
self.log("debug", f"Detected language {lang}")
if (
lang
in {
"ar", # Arabic
"he", # Hebrew,
"fa", # Persian
}
and pdf_file.name != "archive-fallback.pdf"
):
raise RtlLanguageException()
return stripped
except RtlLanguageException:
self.log("warning", f"Detected RTL language {lang}")
return None
except Exception:
# TODO catch all for various issues with PDFminer.six.
# If PDFminer fails, fall back to OCR.
@@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
)
if original_has_text:
self.text = text_original
except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
except (NoTextFoundException, InputFileError) as e:
self.log(
"warning",
f"Encountered an error while running OCR: {str(e)}. "

View File

@@ -670,28 +670,14 @@ class TestParser(DirectoriesMixin, TestCase):
- Text from the document is extracted
"""
parser = RasterisedDocumentParser(None)
with mock.patch.object(
parser,
"construct_ocrmypdf_parameters",
wraps=parser.construct_ocrmypdf_parameters,
) as wrapped:
parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf",
)
parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
"application/pdf",
)
# There isn't a good way to actually check this working, with RTL correctly return
# as it would require tesseract-ocr-ara installed for everyone running the
# test suite. This test does provide the coverage though and attempts to ensure
# the force OCR happens
self.assertIsNotNone(parser.get_text())
self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
# Check the last call kwargs
self.assertTrue(
parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
)
# Copied from the PDF to here. Don't even look at it
self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())
class TestParserFileTypes(DirectoriesMixin, TestCase):