Merge branch 'dev' into feature-permissions

2025-11-01 04:06:16 -05:00 · 2023-01-16 15:59:25 -08:00
parent d689a707a4 0b53a8981c
commit 0cfa5211e9
29 changed files with 923 additions and 482 deletions
--- a/src/documents/management/commands/document_consumer.py
+++ b/src/documents/management/commands/document_consumer.py
@@ -19,7 +19,7 @@ from watchdog.observers.polling import PollingObserver

 try:
    from inotifyrecursive import INotify, flags
-except ImportError:
+except ImportError:  # pragma: nocover
    INotify = flags = None

 logger = logging.getLogger("paperless.management.consumer")
--- a/src/documents/management/commands/document_exporter.py
+++ b/src/documents/management/commands/document_exporter.py
@@ -4,6 +4,9 @@ import os
 import shutil
 import tempfile
 import time
+from pathlib import Path
+from typing import List
+from typing import Set

 import tqdm
 from django.conf import settings
@@ -96,16 +99,16 @@ class Command(BaseCommand):

    def __init__(self, *args, **kwargs):
        BaseCommand.__init__(self, *args, **kwargs)
-        self.target = None
-        self.files_in_export_dir = []
-        self.exported_files = []
+        self.target: Path = None
+        self.files_in_export_dir: Set[Path] = set()
+        self.exported_files: List[Path] = []
        self.compare_checksums = False
        self.use_filename_format = False
        self.delete = False

    def handle(self, *args, **options):

-        self.target = options["target"]
+        self.target = Path(options["target"]).resolve()
        self.compare_checksums = options["compare_checksums"]
        self.use_filename_format = options["use_filename_format"]
        self.delete = options["delete"]
@@ -121,11 +124,14 @@ class Command(BaseCommand):
                dir=settings.SCRATCH_DIR,
                prefix="paperless-export",
            )
-            self.target = temp_dir.name
+            self.target = Path(temp_dir.name).resolve()

-        if not os.path.exists(self.target):
+        if not self.target.exists():
            raise CommandError("That path doesn't exist")

+        if not self.target.is_dir():
+            raise CommandError("That path isn't a directory")
+
        if not os.access(self.target, os.W_OK):
            raise CommandError("That path doesn't appear to be writable")

@@ -152,10 +158,9 @@ class Command(BaseCommand):

    def dump(self, progress_bar_disable=False):
        # 1. Take a snapshot of what files exist in the current export folder
-        for root, dirs, files in os.walk(self.target):
-            self.files_in_export_dir.extend(
-                map(lambda f: os.path.abspath(os.path.join(root, f)), files),
-            )
+        for x in self.target.glob("**/*"):
+            if x.is_file():
+                self.files_in_export_dir.add(x.resolve())

        # 2. Create manifest, containing all correspondents, types, tags, storage paths
        # comments, documents and ui_settings
@@ -238,16 +243,16 @@ class Command(BaseCommand):

            # 3.3. write filenames into manifest
            original_name = base_name
-            original_target = os.path.join(self.target, original_name)
+            original_target = (self.target / Path(original_name)).resolve()
            document_dict[EXPORTER_FILE_NAME] = original_name

            thumbnail_name = base_name + "-thumbnail.webp"
-            thumbnail_target = os.path.join(self.target, thumbnail_name)
+            thumbnail_target = (self.target / Path(thumbnail_name)).resolve()
            document_dict[EXPORTER_THUMBNAIL_NAME] = thumbnail_name

            if document.has_archive_version:
                archive_name = base_name + "-archive.pdf"
-                archive_target = os.path.join(self.target, archive_name)
+                archive_target = (self.target / Path(archive_name)).resolve()
                document_dict[EXPORTER_ARCHIVE_NAME] = archive_name
            else:
                archive_target = None
@@ -256,24 +261,21 @@ class Command(BaseCommand):
            t = int(time.mktime(document.created.timetuple()))
            if document.storage_type == Document.STORAGE_TYPE_GPG:

-                os.makedirs(os.path.dirname(original_target), exist_ok=True)
-                with open(original_target, "wb") as f:
-                    with document.source_file as out_file:
-                        f.write(GnuPG.decrypted(out_file))
-                        os.utime(original_target, times=(t, t))
+                original_target.parent.mkdir(parents=True, exist_ok=True)
+                with document.source_file as out_file:
+                    original_target.write_bytes(GnuPG.decrypted(out_file))
+                    os.utime(original_target, times=(t, t))

-                os.makedirs(os.path.dirname(thumbnail_target), exist_ok=True)
-                with open(thumbnail_target, "wb") as f:
-                    with document.thumbnail_file as out_file:
-                        f.write(GnuPG.decrypted(out_file))
-                        os.utime(thumbnail_target, times=(t, t))
+                thumbnail_target.parent.mkdir(parents=True, exist_ok=True)
+                with document.thumbnail_file as out_file:
+                    thumbnail_target.write_bytes(GnuPG.decrypted(out_file))
+                    os.utime(thumbnail_target, times=(t, t))

                if archive_target:
-                    os.makedirs(os.path.dirname(archive_target), exist_ok=True)
-                    with open(archive_target, "wb") as f:
-                        with document.archive_path as out_file:
-                            f.write(GnuPG.decrypted(out_file))
-                            os.utime(archive_target, times=(t, t))
+                    archive_target.parent.mkdir(parents=True, exist_ok=True)
+                    with document.archive_path as out_file:
+                        archive_target.write_bytes(GnuPG.decrypted(out_file))
+                        os.utime(archive_target, times=(t, t))
            else:
                self.check_and_copy(
                    document.source_path,
@@ -291,16 +293,14 @@ class Command(BaseCommand):
                    )

        # 4.1 write manifest to target folder
-        manifest_path = os.path.abspath(os.path.join(self.target, "manifest.json"))
-
-        with open(manifest_path, "w") as f:
-            json.dump(manifest, f, indent=2)
+        manifest_path = (self.target / Path("manifest.json")).resolve()
+        manifest_path.write_text(json.dumps(manifest, indent=2))

        # 4.2 write version information to target folder
-        version_path = os.path.abspath(os.path.join(self.target, "version.json"))
-
-        with open(version_path, "w") as f:
-            json.dump({"version": version.__full_version_str__}, f, indent=2)
+        version_path = (self.target / Path("version.json")).resolve()
+        version_path.write_text(
+            json.dumps({"version": version.__full_version_str__}, indent=2),
+        )

        if self.delete:
            # 5. Remove files which we did not explicitly export in this run
@@ -309,25 +309,24 @@ class Command(BaseCommand):
                self.files_in_export_dir.remove(manifest_path)

            for f in self.files_in_export_dir:
-                os.remove(f)
+                f.unlink()

                delete_empty_directories(
-                    os.path.abspath(os.path.dirname(f)),
-                    os.path.abspath(self.target),
+                    f.parent,
+                    self.target,
                )

-    def check_and_copy(self, source, source_checksum, target):
-        if os.path.abspath(target) in self.files_in_export_dir:
-            self.files_in_export_dir.remove(os.path.abspath(target))
+    def check_and_copy(self, source, source_checksum, target: Path):
+        if target in self.files_in_export_dir:
+            self.files_in_export_dir.remove(target)

        perform_copy = False

-        if os.path.exists(target):
+        if target.exists():
            source_stat = os.stat(source)
-            target_stat = os.stat(target)
+            target_stat = target.stat()
            if self.compare_checksums and source_checksum:
-                with open(target, "rb") as f:
-                    target_checksum = hashlib.md5(f.read()).hexdigest()
+                target_checksum = hashlib.md5(target.read_bytes()).hexdigest()
                perform_copy = target_checksum != source_checksum
            elif source_stat.st_mtime != target_stat.st_mtime:
                perform_copy = True
@@ -338,5 +337,5 @@ class Command(BaseCommand):
            perform_copy = True

        if perform_copy:
-            os.makedirs(os.path.dirname(target), exist_ok=True)
+            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(source, target)
--- a/src/documents/signals/handlers.py
+++ b/src/documents/signals/handlers.py
@@ -447,7 +447,7 @@ def update_filename_and_move_files(sender, instance, **kwargs):
            )

        except (OSError, DatabaseError, CannotMoveFilesException) as e:
-            logger.warn(f"Exception during file handling: {e}")
+            logger.warning(f"Exception during file handling: {e}")
            # This happens when either:
            #  - moving the files failed due to file system errors
            #  - saving to the database failed due to database errors
--- a/src/documents/tasks.py
+++ b/src/documents/tasks.py
@@ -3,10 +3,10 @@ import logging
 import os
 import shutil
 import uuid
-from datetime import datetime
 from pathlib import Path
 from typing import Type

+import dateutil.parser
 import tqdm
 from asgiref.sync import async_to_sync
 from celery import shared_task
@@ -107,7 +107,7 @@ def consume_file(
    # More types will be retained through JSON encode/decode
    if override_created is not None and isinstance(override_created, str):
        try:
-            override_created = datetime.fromisoformat(override_created)
+            override_created = dateutil.parser.isoparse(override_created)
        except Exception:
            pass

--- a/src/documents/tests/test_api.py
+++ b/src/documents/tests/test_api.py
@@ -7,6 +7,7 @@ import tempfile
 import urllib.request
 import uuid
 import zipfile
+from pathlib import Path
 from unittest import mock
 from unittest.mock import MagicMock

@@ -812,7 +813,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
        m.assert_called_once()

        args, kwargs = m.call_args
-        self.assertEqual(kwargs["override_filename"], "simple.pdf")
+        file_path = Path(args[0])
+        self.assertEqual(file_path.name, "simple.pdf")
+        self.assertIn(Path(settings.SCRATCH_DIR), file_path.parents)
        self.assertIsNone(kwargs["override_title"])
        self.assertIsNone(kwargs["override_correspondent_id"])
        self.assertIsNone(kwargs["override_document_type_id"])
@@ -837,7 +840,9 @@ class TestDocumentApi(DirectoriesMixin, APITestCase):
        m.assert_called_once()

        args, kwargs = m.call_args
-        self.assertEqual(kwargs["override_filename"], "simple.pdf")
+        file_path = Path(args[0])
+        self.assertEqual(file_path.name, "simple.pdf")
+        self.assertIn(Path(settings.SCRATCH_DIR), file_path.parents)
        self.assertIsNone(kwargs["override_title"])
        self.assertIsNone(kwargs["override_correspondent_id"])
        self.assertIsNone(kwargs["override_document_type_id"])
--- a/src/documents/tests/test_management_exporter.py
+++ b/src/documents/tests/test_management_exporter.py
@@ -8,6 +8,7 @@ from unittest import mock
 from zipfile import ZipFile

 from django.core.management import call_command
+from django.core.management.base import CommandError
 from django.test import override_settings
 from django.test import TestCase
 from django.utils import timezone
@@ -438,3 +439,61 @@ class TestExportImport(DirectoriesMixin, TestCase):
            self.assertEqual(len(zip.namelist()), 14)
            self.assertIn("manifest.json", zip.namelist())
            self.assertIn("version.json", zip.namelist())
+
+    def test_export_target_not_exists(self):
+        """
+        GIVEN:
+            - Request to export documents to directory that doesn't exist
+        WHEN:
+            - Export command is called
+        THEN:
+            - Error is raised
+            - The error reports that the path doesn't exist
+        """
+        args = ["document_exporter", "/tmp/foo/bar"]
+
+        with self.assertRaises(CommandError) as e:
+            call_command(*args)
+
+        # The message is checked after the block: statements that follow the
+        # raising call inside the ``with`` body are never executed.  The
+        # raised exception lives on the context manager's ``exception``.
+        self.assertEqual("That path doesn't exist", str(e.exception))
+
+    def test_export_target_exists_but_is_file(self):
+        """
+        GIVEN:
+            - Request to export documents to file instead of directory
+        WHEN:
+            - Export command is called
+        THEN:
+            - Error is raised
+            - The error reports that the path isn't a directory
+        """
+
+        with tempfile.NamedTemporaryFile() as tmp_file:
+
+            args = ["document_exporter", tmp_file.name]
+
+            with self.assertRaises(CommandError) as e:
+                call_command(*args)
+
+            # Checked outside the assertRaises body (code after the raising
+            # call in that body never runs) and against the captured
+            # exception rather than the context manager itself.
+            self.assertEqual("That path isn't a directory", str(e.exception))
+
+    def test_export_target_not_writable(self):
+        """
+        GIVEN:
+            - Request to export documents to directory that's not writable
+        WHEN:
+            - Export command is called
+        THEN:
+            - Error is raised
+            - The error reports that the path isn't writable
+        """
+        with tempfile.TemporaryDirectory() as tmp_dir:
+
+            os.chmod(tmp_dir, 0o000)
+
+            args = ["document_exporter", tmp_dir]
+
+            try:
+                with self.assertRaises(CommandError) as e:
+                    call_command(*args)
+            finally:
+                # Give access back so the temporary directory can be removed
+                os.chmod(tmp_dir, 0o700)
+
+            # Checked outside the assertRaises body (code after the raising
+            # call in that body never runs) and against the captured
+            # exception rather than the context manager itself.
+            self.assertEqual(
+                "That path doesn't appear to be writable",
+                str(e.exception),
+            )
--- a/src/documents/views.py
+++ b/src/documents/views.py
@@ -8,10 +8,12 @@ import urllib
 import uuid
 import zipfile
 from datetime import datetime
+from pathlib import Path
 from time import mktime
 from unicodedata import normalize
 from urllib.parse import quote

+import pathvalidate
 from django.conf import settings
 from django.contrib.auth.models import User
 from django.db.models import Case
@@ -33,6 +35,7 @@ from documents.filters import ObjectOwnedOrGrandtedPermissionsFilter
 from documents.permissions import PaperlessAdminPermissions
 from documents.permissions import PaperlessObjectPermissions
 from documents.tasks import consume_file
+from langdetect import detect
 from packaging import version as packaging_version
 from paperless import version
 from paperless.db import GnuPG
@@ -361,6 +364,13 @@ class DocumentViewSet(
            "original_filename": doc.original_filename,
        }

+        lang = "en"
+        try:
+            lang = detect(doc.content)
+        except Exception:
+            pass
+        meta["lang"] = lang
+
        if doc.has_archive_version:
            meta["archive_size"] = self.get_filesize(doc.archive_path)
            meta["archive_metadata"] = self.get_metadata(
@@ -658,20 +668,19 @@ class PostDocumentView(GenericAPIView):

        os.makedirs(settings.SCRATCH_DIR, exist_ok=True)

-        with tempfile.NamedTemporaryFile(
-            prefix="paperless-upload-",
-            dir=settings.SCRATCH_DIR,
-            delete=False,
-        ) as f:
-            f.write(doc_data)
-            os.utime(f.name, times=(t, t))
-            temp_filename = f.name
+        temp_file_path = Path(tempfile.mkdtemp(dir=settings.SCRATCH_DIR)) / Path(
+            pathvalidate.sanitize_filename(doc_name),
+        )
+
+        temp_file_path.write_bytes(doc_data)
+
+        os.utime(temp_file_path, times=(t, t))

        task_id = str(uuid.uuid4())

        async_task = consume_file.delay(
-            temp_filename,
-            override_filename=doc_name,
+            # Paths are not JSON friendly
+            str(temp_file_path),
            override_title=title,
            override_correspondent_id=correspondent_id,
            override_document_type_id=document_type_id,
--- a/src/paperless/settings.py
+++ b/src/paperless/settings.py
@@ -5,6 +5,7 @@ import multiprocessing
 import os
 import re
 import tempfile
+from typing import Dict
 from typing import Final
 from typing import Optional
 from typing import Set
@@ -107,6 +108,57 @@ def _parse_redis_url(env_redis: Optional[str]) -> Tuple[str]:
    return (env_redis, env_redis)


+def _parse_beat_schedule() -> Dict:
+    """
+    Build the Celery beat schedule from the environment.
+
+    Every periodic task reads its own environment variable, which holds a
+    crontab(5) expression, and falls back to a built-in default when the
+    variable is unset.  The literal value "disable" leaves the task out of
+    the schedule entirely.
+
+    Returns:
+        A dict mapping the task's display name to a dict with the keys
+        "task" (dotted task path) and "schedule" (a crontab instance).
+
+    Raises:
+        ValueError: if an expression does not consist of exactly five fields.
+    """
+    schedule = {}
+    tasks = [
+        {
+            "name": "Check all e-mail accounts",
+            "env_key": "PAPERLESS_EMAIL_TASK_CRON",
+            # Default every ten minutes
+            "env_default": "*/10 * * * *",
+            "task": "paperless_mail.tasks.process_mail_accounts",
+        },
+        {
+            "name": "Train the classifier",
+            "env_key": "PAPERLESS_TRAIN_TASK_CRON",
+            # Default hourly at 5 minutes past the hour
+            "env_default": "5 */1 * * *",
+            "task": "documents.tasks.train_classifier",
+        },
+        {
+            "name": "Optimize the index",
+            "env_key": "PAPERLESS_INDEX_TASK_CRON",
+            # Default daily at midnight
+            "env_default": "0 0 * * *",
+            "task": "documents.tasks.index_optimize",
+        },
+        {
+            "name": "Perform sanity check",
+            "env_key": "PAPERLESS_SANITY_TASK_CRON",
+            # Default Sunday at 00:30
+            "env_default": "30 0 * * sun",
+            "task": "documents.tasks.sanity_check",
+        },
+    ]
+    for task in tasks:
+        # Either get the environment setting or use the default
+        value = os.getenv(task["env_key"], task["env_default"])
+        # Don't add disabled tasks to the schedule
+        if value == "disable":
+            continue
+        # I find https://crontab.guru/ super helpful
+        # crontab(5) format
+        #   - five time-and-date fields
+        #   - separated by at least one blank
+        # split() without a separator accepts any run of whitespace, so
+        # repeated blanks or tabs between the fields are tolerated
+        minute, hour, day_month, month, day_week = value.split()
+        schedule[task["name"]] = {
+            "task": task["task"],
+            # Passed by keyword: crontab() takes day_of_week before
+            # day_of_month, which differs from the crontab(5) field order
+            "schedule": crontab(
+                minute=minute,
+                hour=hour,
+                day_of_week=day_week,
+                day_of_month=day_month,
+                month_of_year=month,
+            ),
+        }
+
+    return schedule
+
+
 # NEVER RUN WITH DEBUG IN PRODUCTION.
 DEBUG = __get_boolean("PAPERLESS_DEBUG", "NO")

@@ -126,7 +178,7 @@ THUMBNAIL_DIR = os.path.join(MEDIA_ROOT, "documents", "thumbnails")

 DATA_DIR = __get_path("PAPERLESS_DATA_DIR", os.path.join(BASE_DIR, "..", "data"))

-NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/local/share/nltk_data")
+NLTK_DIR = __get_path("PAPERLESS_NLTK_DIR", "/usr/share/nltk_data")

 TRASH_DIR = os.getenv("PAPERLESS_TRASH_DIR")

@@ -533,29 +585,10 @@ CELERY_RESULT_EXTENDED = True
 CELERY_RESULT_BACKEND = "django-db"
 CELERY_CACHE_BACKEND = "default"

+# https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-schedule
+CELERY_BEAT_SCHEDULE = _parse_beat_schedule()

-CELERY_BEAT_SCHEDULE = {
-    # Every ten minutes
-    "Check all e-mail accounts": {
-        "task": "paperless_mail.tasks.process_mail_accounts",
-        "schedule": crontab(minute="*/10"),
-    },
-    # Hourly at 5 minutes past the hour
-    "Train the classifier": {
-        "task": "documents.tasks.train_classifier",
-        "schedule": crontab(minute="5", hour="*/1"),
-    },
-    # Daily at midnight
-    "Optimize the index": {
-        "task": "documents.tasks.index_optimize",
-        "schedule": crontab(minute=0, hour=0),
-    },
-    # Weekly, Sunday at 00:30
-    "Perform sanity check": {
-        "task": "documents.tasks.sanity_check",
-        "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
-    },
-}
+# https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-schedule-filename
 CELERY_BEAT_SCHEDULE_FILENAME = os.path.join(DATA_DIR, "celerybeat-schedule.db")

 # django setting.
--- a/src/paperless/tests/test_settings.py
+++ b/src/paperless/tests/test_settings.py
@@ -1,7 +1,10 @@
 import datetime
+import os
 from unittest import mock
 from unittest import TestCase

+from celery.schedules import crontab
+from paperless.settings import _parse_beat_schedule
 from paperless.settings import _parse_ignore_dates
 from paperless.settings import _parse_redis_url
 from paperless.settings import default_threads_per_worker
@@ -60,6 +63,8 @@ class TestIgnoreDateParsing(TestCase):

        self._parse_checker(test_cases)

+
+class TestThreadCalculation(TestCase):
    def test_workers_threads(self):
        """
        GIVEN:
@@ -84,6 +89,8 @@ class TestIgnoreDateParsing(TestCase):

                self.assertLessEqual(default_workers * default_threads, i)

+
+class TestRedisSocketConversion(TestCase):
    def test_redis_socket_parsing(self):
        """
        GIVEN:
@@ -139,3 +146,132 @@ class TestIgnoreDateParsing(TestCase):
        ]:
            result = _parse_redis_url(input)
            self.assertTupleEqual(expected, result)
+
+
+class TestCeleryScheduleParsing(TestCase):
+    """
+    Checks the schedule built by _parse_beat_schedule() for the default,
+    overridden and disabled task configurations.
+    """
+
+    # Environment variables these tests override or expect to be unset
+    SCHEDULE_ENV_KEYS = (
+        "PAPERLESS_EMAIL_TASK_CRON",
+        "PAPERLESS_TRAIN_TASK_CRON",
+        "PAPERLESS_INDEX_TASK_CRON",
+        "PAPERLESS_SANITY_TASK_CRON",
+    )
+
+    def setUp(self) -> None:
+        super().setUp()
+        # Snapshot os.environ and drop any ambient schedule overrides, so the
+        # "default" expectations below hold no matter what the developer's or
+        # CI's environment contains.  The snapshot is restored on cleanup.
+        env_patcher = mock.patch.dict(os.environ)
+        env_patcher.start()
+        self.addCleanup(env_patcher.stop)
+        for key in self.SCHEDULE_ENV_KEYS:
+            os.environ.pop(key, None)
+
+    def test_schedule_configuration_default(self):
+        """
+        GIVEN:
+            - No configured task schedules
+        WHEN:
+            - The celery beat schedule is built
+        THEN:
+            - The default schedule is returned
+        """
+        schedule = _parse_beat_schedule()
+
+        self.assertDictEqual(
+            {
+                "Check all e-mail accounts": {
+                    "task": "paperless_mail.tasks.process_mail_accounts",
+                    "schedule": crontab(minute="*/10"),
+                },
+                "Train the classifier": {
+                    "task": "documents.tasks.train_classifier",
+                    "schedule": crontab(minute="5", hour="*/1"),
+                },
+                "Optimize the index": {
+                    "task": "documents.tasks.index_optimize",
+                    "schedule": crontab(minute=0, hour=0),
+                },
+                "Perform sanity check": {
+                    "task": "documents.tasks.sanity_check",
+                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                },
+            },
+            schedule,
+        )
+
+    def test_schedule_configuration_changed(self):
+        """
+        GIVEN:
+            - Email task is configured non-default
+        WHEN:
+            - The celery beat schedule is built
+        THEN:
+            - The email task is configured per environment
+            - The default schedule is returned for other tasks
+        """
+        with mock.patch.dict(
+            os.environ,
+            {"PAPERLESS_EMAIL_TASK_CRON": "*/50 * * * mon"},
+        ):
+            schedule = _parse_beat_schedule()
+
+        self.assertDictEqual(
+            {
+                "Check all e-mail accounts": {
+                    "task": "paperless_mail.tasks.process_mail_accounts",
+                    "schedule": crontab(minute="*/50", day_of_week="mon"),
+                },
+                "Train the classifier": {
+                    "task": "documents.tasks.train_classifier",
+                    "schedule": crontab(minute="5", hour="*/1"),
+                },
+                "Optimize the index": {
+                    "task": "documents.tasks.index_optimize",
+                    "schedule": crontab(minute=0, hour=0),
+                },
+                "Perform sanity check": {
+                    "task": "documents.tasks.sanity_check",
+                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                },
+            },
+            schedule,
+        )
+
+    def test_schedule_configuration_disabled(self):
+        """
+        GIVEN:
+            - Search index task is disabled
+        WHEN:
+            - The celery beat schedule is built
+        THEN:
+            - The search index task is not present
+            - The default schedule is returned for other tasks
+        """
+        with mock.patch.dict(os.environ, {"PAPERLESS_INDEX_TASK_CRON": "disable"}):
+            schedule = _parse_beat_schedule()
+
+        self.assertDictEqual(
+            {
+                "Check all e-mail accounts": {
+                    "task": "paperless_mail.tasks.process_mail_accounts",
+                    "schedule": crontab(minute="*/10"),
+                },
+                "Train the classifier": {
+                    "task": "documents.tasks.train_classifier",
+                    "schedule": crontab(minute="5", hour="*/1"),
+                },
+                "Perform sanity check": {
+                    "task": "documents.tasks.sanity_check",
+                    "schedule": crontab(minute=30, hour=0, day_of_week="sun"),
+                },
+            },
+            schedule,
+        )
+
+    def test_schedule_configuration_disabled_all(self):
+        """
+        GIVEN:
+            - All tasks are disabled
+        WHEN:
+            - The celery beat schedule is built
+        THEN:
+            - No tasks are scheduled
+        """
+        with mock.patch.dict(
+            os.environ,
+            {
+                "PAPERLESS_EMAIL_TASK_CRON": "disable",
+                "PAPERLESS_TRAIN_TASK_CRON": "disable",
+                "PAPERLESS_SANITY_TASK_CRON": "disable",
+                "PAPERLESS_INDEX_TASK_CRON": "disable",
+            },
+        ):
+            schedule = _parse_beat_schedule()
+
+        self.assertDictEqual(
+            {},
+            schedule,
+        )
--- a/src/paperless/urls.py
+++ b/src/paperless/urls.py
@@ -158,7 +158,7 @@ urlpatterns = [


 websocket_urlpatterns = [
-    re_path(r"ws/status/$", StatusConsumer.as_asgi()),
+    path(settings.BASE_URL.lstrip("/") + "ws/status/", StatusConsumer.as_asgi()),
 ]

 # Text in each page's <h1> (and above login form).
--- a/src/paperless_mail/mail.py
+++ b/src/paperless_mail/mail.py
@@ -1,3 +1,4 @@
+import itertools
 import os
 import re
 import tempfile
@@ -25,6 +26,28 @@ from imap_tools.mailbox import MailBoxTls
 from paperless_mail.models import MailAccount
 from paperless_mail.models import MailRule

+# Apple Mail marks a colored flag with the general IMAP "\Flagged" FLAG plus
+# a combination of "$MailFlagBit<N>" IMAP KEYWORDs that selects the color.
+# Flag lists as seen via imaplib => conn.fetch(b"<message_id>", "FLAGS"):
+
+# no flag   - (FLAGS (\Seen $NotJunk NotJunk))
+# red       - (FLAGS (\Flagged \Seen $NotJunk NotJunk))
+# orange    - (FLAGS (\Flagged \Seen $NotJunk NotJunk $MailFlagBit0))
+# yellow    - (FLAGS (\Flagged \Seen $NotJunk NotJunk $MailFlagBit1))
+# blue      - (FLAGS (\Flagged \Seen $NotJunk NotJunk $MailFlagBit2))
+# green     - (FLAGS (\Flagged \Seen $NotJunk NotJunk $MailFlagBit0 $MailFlagBit1))
+# violet    - (FLAGS (\Flagged \Seen $NotJunk NotJunk $MailFlagBit0 $MailFlagBit2))
+# grey      - (FLAGS (\Flagged \Seen $NotJunk NotJunk $MailFlagBit1 $MailFlagBit2))
+
+# Maps each color name to the "$MailFlagBit<N>" keywords listed for it above;
+# "red" has none because it is the plain "\Flagged" FLAG on its own.
+APPLE_MAIL_TAG_COLORS = {
+    "red": [],
+    "orange": ["$MailFlagBit0"],
+    "yellow": ["$MailFlagBit1"],
+    "blue": ["$MailFlagBit2"],
+    "green": ["$MailFlagBit0", "$MailFlagBit1"],
+    "violet": ["$MailFlagBit0", "$MailFlagBit2"],
+    "grey": ["$MailFlagBit1", "$MailFlagBit2"],
+}
+

 class MailError(Exception):
    pass
@@ -66,18 +89,59 @@ class FlagMailAction(BaseMailAction):

 class TagMailAction(BaseMailAction):
+    """
+    Mail action that tags consumed mails, either with a plain IMAP keyword
+    or, for parameters containing "apple:<color>", with Apple Mail colored
+    flags.
+    """
+
    def __init__(self, parameter):
-        self.keyword = parameter
+        """
+        Interpret the rule's action parameter.
+
+        A parameter containing "apple:" selects an Apple Mail color
+        (self.color is set, self.keyword is None).  Any other parameter is
+        used verbatim as the keyword (self.keyword is set, self.color is
+        None).
+
+        Raises MailError if the requested color is not a key of
+        APPLE_MAIL_TAG_COLORS.
+        """
+
+        # The custom tag should look like "apple:<color>"
+        if "apple:" in parameter.lower():
+
+            # partition() splits on the first colon only, so a parameter
+            # with extra colons is rejected below with MailError rather
+            # than failing on tuple unpacking.
+            _, _, color = parameter.partition(":")
+
+            # Stored in lower case: the value is used later as the lookup
+            # key into APPLE_MAIL_TAG_COLORS, whose keys are lower case.
+            self.color = color.strip().lower()
+
+            if self.color not in APPLE_MAIL_TAG_COLORS:
+                raise MailError("Not a valid AppleMail tag color.")
+
+            self.keyword = None
+
+        else:
+            self.keyword = parameter
+            self.color = None

    def get_criteria(self):
+        """
+        Return the search criteria selecting mails not yet tagged.
+        """
+
+        # AppleMail: We only need to check if mails are \Flagged
+        if self.color:
+            return {"flagged": False}
+
        return {"no_keyword": self.keyword, "gmail_label": self.keyword}

    def post_consume(self, M: MailBox, message_uids, parameter):
+        """
+        Tag the given messages after they have been consumed.
+
+        Raises MailError if neither a color nor a keyword is configured.
+        """
+        # NOTE(review): the Gmail host check comes first, so on Gmail hosts
+        # it also runs for "apple:<color>" rules, where self.keyword is
+        # None - confirm that combination is intended.
        if re.search(r"gmail\.com$|googlemail\.com$", M._host):
            for uid in message_uids:
                M.client.uid("STORE", uid, "X-GM-LABELS", self.keyword)
-        else:
+
+        # AppleMail
+        elif self.color:
+
+            # Remove all existing $MailFlagBits
+            M.flag(
+                message_uids,
+                set(itertools.chain(*APPLE_MAIL_TAG_COLORS.values())),
+                False,
+            )
+
+            # Set new $MailFlagBits
+            M.flag(message_uids, APPLE_MAIL_TAG_COLORS.get(self.color), True)
+
+            # Set the general \Flagged
+            # This defaults to the "red" flag in AppleMail and
+            # "stars" in Thunderbird or GMail
+            M.flag(message_uids, [MailMessageFlags.FLAGGED], True)
+
+        elif self.keyword:
            M.flag(message_uids, [self.keyword], True)

+        else:
+            raise MailError("No keyword specified.")
+
+

 def get_rule_action(rule) -> BaseMailAction:
    if rule.action == MailRule.MailAction.FLAG:
@@ -197,14 +261,14 @@ class MailAccountHandler(LoggingMixin):

                    try:
                        M.login_utf8(account.username, account.password)
-                    except Exception as err:
+                    except Exception as e:
                        self.log(
                            "error",
                            "Unable to authenticate with mail server using AUTH=PLAIN",
                        )
                        raise MailError(
                            f"Error while authenticating account {account}",
-                        ) from err
+                        ) from e
                except Exception as e:
                    self.log(
                        "error",
--- a/src/paperless_mail/tests/test_mail.py
+++ b/src/paperless_mail/tests/test_mail.py
@@ -24,6 +24,7 @@ from imap_tools import NOT
 from paperless_mail import tasks
 from paperless_mail.mail import MailAccountHandler
 from paperless_mail.mail import MailError
+from paperless_mail.mail import TagMailAction
 from paperless_mail.models import MailAccount
 from paperless_mail.models import MailRule

@@ -674,6 +675,39 @@ class TestMail(DirectoriesMixin, TestCase):
        self.assertEqual(len(self.bogus_mailbox.fetch(criteria, False)), 0)
        self.assertEqual(len(self.bogus_mailbox.messages), 3)

+    def test_tag_mail_action_applemail_wrong_input(self):
+        """
+        Constructing a TagMailAction with the parameter "apple:black"
+        raises MailError.
+        """
+        with self.assertRaises(MailError):
+            TagMailAction("apple:black")
+
+    def test_handle_mail_account_tag_applemail(self):
+        """
+        Handle a mail account whose only rule is a TAG action with the
+        parameter "apple:green", and check the effect on the mailbox.
+        """
+        # all mails will be FLAGGED afterwards
+
+        account = MailAccount.objects.create(
+            name="test",
+            imap_server="",
+            username="admin",
+            password="secret",
+        )
+
+        _ = MailRule.objects.create(
+            name="testrule",
+            account=account,
+            action=MailRule.MailAction.TAG,
+            action_parameter="apple:green",
+        )
+
+        # Before handling: three mails, two of them still unflagged, and
+        # nothing queued for consumption yet
+        self.assertEqual(len(self.bogus_mailbox.messages), 3)
+        self.assertEqual(self.async_task.call_count, 0)
+        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 2)
+        self.mail_account_handler.handle_mail_account(account)
+        # After handling: two consume calls were made, no mail is unflagged
+        # any more, and no mail was removed from the mailbox
+        self.assertEqual(self.async_task.call_count, 2)
+        self.assertEqual(len(self.bogus_mailbox.fetch("UNFLAGGED", False)), 0)
+        self.assertEqual(len(self.bogus_mailbox.messages), 3)
+
    def test_error_login(self):
        account = MailAccount.objects.create(
            name="test",
--- a/src/paperless_tesseract/parsers.py
+++ b/src/paperless_tesseract/parsers.py
@@ -2,6 +2,7 @@ import json
 import os
 import re
 import subprocess
+import tempfile
 from pathlib import Path
 from typing import Optional

@@ -137,36 +138,27 @@ class RasterisedDocumentParser(DocumentParser):
        if not os.path.isfile(pdf_file):
            return None

-        from pdfminer.high_level import extract_text as pdfminer_extract_text
-
        try:
-            stripped = post_process_text(pdfminer_extract_text(pdf_file))
+            text = None
+            with tempfile.NamedTemporaryFile(
+                mode="w+",
+                dir=self.tempdir,
+            ) as tmp:
+                subprocess.run(
+                    [
+                        "pdftotext",
+                        "-q",
+                        "-layout",
+                        "-enc",
+                        "UTF-8",
+                        pdf_file,
+                        tmp.name,
+                    ],
+                )
+                text = tmp.read()

-            self.log("debug", f"Extracted text from PDF file {pdf_file}")
+            return post_process_text(text)

-            # pdfminer.six does not handle RTL text
-            # as a hack, for some languages, return no text, to force
-            # OCRMyPdf/Tesseract do handle this correctly
-            from langdetect import detect
-
-            lang = detect(stripped)
-
-            self.log("debug", f"Detected language {lang}")
-
-            if (
-                lang
-                in {
-                    "ar",  # Arabic
-                    "he",  # Hebrew,
-                    "fa",  # Persian
-                }
-                and pdf_file.name != "archive-fallback.pdf"
-            ):
-                raise RtlLanguageException()
-            return stripped
-        except RtlLanguageException:
-            self.log("warning", f"Detected RTL language {lang}")
-            return None
        except Exception:
            # TODO catch all for various issues with PDFminer.six.
            #  If PDFminer fails, fall back to OCR.
@@ -342,7 +334,7 @@ class RasterisedDocumentParser(DocumentParser):
            )
            if original_has_text:
                self.text = text_original
-        except (NoTextFoundException, RtlLanguageException, InputFileError) as e:
+        except (NoTextFoundException, InputFileError) as e:
            self.log(
                "warning",
                f"Encountered an error while running OCR: {str(e)}. "
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -670,28 +670,14 @@ class TestParser(DirectoriesMixin, TestCase):
            - Text from the document is extracted
        """
        parser = RasterisedDocumentParser(None)
-        with mock.patch.object(
-            parser,
-            "construct_ocrmypdf_parameters",
-            wraps=parser.construct_ocrmypdf_parameters,
-        ) as wrapped:

-            parser.parse(
-                os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
-                "application/pdf",
-            )
+        parser.parse(
+            os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"),
+            "application/pdf",
+        )

-            # There isn't a good way to actually check this working, with RTL correctly return
-            #  as it would require tesseract-ocr-ara installed for everyone running the
-            #  test suite.  This test does provide the coverage though and attempts to ensure
-            # the force OCR happens
-            self.assertIsNotNone(parser.get_text())
-
-            self.assertEqual(parser.construct_ocrmypdf_parameters.call_count, 2)
-            # Check the last call kwargs
-            self.assertTrue(
-                parser.construct_ocrmypdf_parameters.call_args.kwargs["safe_fallback"],
-            )
+        # Copied from the PDF to here.  Don't even look at it
+        self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())


 class TestParserFileTypes(DirectoriesMixin, TestCase):