mirror of https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-09-06 21:13:43 -05:00

Chore: switch from os.path to pathlib.Path (#10539)
committed by GitHub

parent cc621cf729
commit d2064a2535
@@ -32,7 +32,7 @@ except ImportError: # pragma: no cover
 logger = logging.getLogger("paperless.management.consumer")


-def _tags_from_path(filepath) -> list[int]:
+def _tags_from_path(filepath: Path) -> list[int]:
     """
     Walk up the directory tree from filepath to CONSUMPTION_DIR
     and get or create Tag IDs for every directory.
@@ -41,7 +41,7 @@ def _tags_from_path(filepath) -> list[int]:
     """
     db.close_old_connections()
     tag_ids = set()
-    path_parts = Path(filepath).relative_to(settings.CONSUMPTION_DIR).parent.parts
+    path_parts = filepath.relative_to(settings.CONSUMPTION_DIR).parent.parts
     for part in path_parts:
         tag_ids.add(
             Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk,
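The behaviour both sides of this hunk rely on is Path.relative_to(...).parent.parts, which yields one component per directory between the consumption dir and the file. A minimal sketch, with made-up directory names standing in for settings.CONSUMPTION_DIR:

from pathlib import Path

consumption_dir = Path("/consume")  # stand-in for settings.CONSUMPTION_DIR
filepath = Path("/consume/invoices/2024/scan.pdf")

# Each directory between the consumption dir and the file becomes a tag name.
print(filepath.relative_to(consumption_dir).parent.parts)  # ('invoices', '2024')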
@@ -50,17 +50,13 @@ def _tags_from_path(filepath) -> list[int]:
     return list(tag_ids)


-def _is_ignored(filepath: str) -> bool:
+def _is_ignored(filepath: Path) -> bool:
     """
     Checks if the given file should be ignored, based on configured
     patterns.

     Returns True if the file is ignored, False otherwise
     """
-    filepath = os.path.abspath(
-        os.path.normpath(filepath),
-    )
-
     # Trim out the consume directory, leaving only filename and it's
     # path relative to the consume directory
     filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR)
@@ -85,15 +81,15 @@ def _is_ignored(filepath: str) -> bool:
     return False


-def _consume(filepath: str) -> None:
-    if os.path.isdir(filepath) or _is_ignored(filepath):
+def _consume(filepath: Path) -> None:
+    if filepath.is_dir() or _is_ignored(filepath):
         return

-    if not os.path.isfile(filepath):
+    if not filepath.is_file():
         logger.debug(f"Not consuming file {filepath}: File has moved.")
         return

-    if not is_file_ext_supported(os.path.splitext(filepath)[1]):
+    if not is_file_ext_supported(filepath.suffix):
         logger.warning(f"Not consuming file {filepath}: Unknown file extension.")
         return
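The hunk above is a mechanical translation; for reference, the pathlib equivalences it uses (a self-contained check, not part of the commit):

import os.path
from pathlib import Path

p = Path("/consume/scan.pdf")

assert p.is_dir() == os.path.isdir(p)
assert p.is_file() == os.path.isfile(p)
assert p.suffix == os.path.splitext(p)[1]  # ".pdf", dot included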
@@ -107,7 +103,7 @@ def _consume(filepath: str) -> None:

     while (read_try_count < os_error_retry_count) and not file_open_ok:
         try:
-            with open(filepath, "rb"):
+            with filepath.open("rb"):
                 file_open_ok = True
         except OSError as e:
             read_try_count += 1
@@ -141,7 +137,7 @@ def _consume(filepath: str) -> None:
         logger.exception("Error while consuming document")


-def _consume_wait_unmodified(file: str) -> None:
+def _consume_wait_unmodified(file: Path) -> None:
     """
     Waits for the given file to appear unmodified based on file size
     and modification time. Will wait a configured number of seconds
@@ -157,7 +153,7 @@ def _consume_wait_unmodified(file: str) -> None:
     current_try = 0
     while current_try < settings.CONSUMER_POLLING_RETRY_COUNT:
         try:
-            stat_data = os.stat(file)
+            stat_data = file.stat()
             new_mtime = stat_data.st_mtime
             new_size = stat_data.st_size
         except FileNotFoundError:
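For orientation, a condensed, illustrative version of the polling pattern this function implements (names and retry arguments are made up; the real function reads them from settings):

import time
from pathlib import Path

def wait_until_unmodified(file: Path, retries: int = 5, delay: float = 1.0) -> bool:
    """Return True once two consecutive stat() calls agree on size and mtime."""
    last = None
    for _ in range(retries):
        try:
            st = file.stat()
        except FileNotFoundError:
            return False  # file vanished, e.g. a scanner's temporary file
        current = (st.st_mtime, st.st_size)
        if current == last:
            return True
        last = current
        time.sleep(delay)
    return False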
@@ -182,10 +178,10 @@ class Handler(FileSystemEventHandler):
         self._pool = pool

     def on_created(self, event):
-        self._pool.submit(_consume_wait_unmodified, event.src_path)
+        self._pool.submit(_consume_wait_unmodified, Path(event.src_path))

     def on_moved(self, event):
-        self._pool.submit(_consume_wait_unmodified, event.dest_path)
+        self._pool.submit(_consume_wait_unmodified, Path(event.dest_path))


 class Command(BaseCommand):
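On the watchdog side, event.src_path and event.dest_path are plain strings, so the conversion to Path happens at this boundary rather than inside the workers. A hypothetical standalone version of the same pattern:

from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer

class PrintHandler(FileSystemEventHandler):
    def __init__(self, pool: ThreadPoolExecutor) -> None:
        self._pool = pool

    def on_created(self, event):
        # Normalize the str path to a Path before handing it to the pool.
        self._pool.submit(print, Path(event.src_path))

pool = ThreadPoolExecutor(max_workers=1)
observer = Observer()
observer.schedule(PrintHandler(pool), "/tmp", recursive=True)
observer.start()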
@@ -227,9 +223,9 @@ class Command(BaseCommand):
         if not directory:
             raise CommandError("CONSUMPTION_DIR does not appear to be set.")

-        directory = os.path.abspath(directory)
+        directory = Path(directory).resolve()

-        if not os.path.isdir(directory):
+        if not directory.is_dir():
             raise CommandError(f"Consumption directory {directory} does not exist")

         # Consumer will need this
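One subtlety the hunk above accepts: os.path.abspath() only normalizes the path, whereas Path.resolve() also follows symlinks, so a symlinked consumption directory is now stored under its real location. Illustration (paths hypothetical):

import os.path
from pathlib import Path

# Suppose /data/consume is a symlink to /mnt/nas/consume:
print(os.path.abspath("/data/consume"))  # /data/consume
print(Path("/data/consume").resolve())   # /mnt/nas/consume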
@@ -238,11 +234,11 @@ class Command(BaseCommand):
         if recursive:
             for dirpath, _, filenames in os.walk(directory):
                 for filename in filenames:
-                    filepath = os.path.join(dirpath, filename)
+                    filepath = Path(dirpath) / filename
                     _consume(filepath)
         else:
-            for entry in os.scandir(directory):
-                _consume(entry.path)
+            for filepath in directory.iterdir():
+                _consume(filepath)

         if options["oneshot"]:
             return
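Both branches still yield every direct entry: iterdir(), like os.scandir(), returns directories as well, which is harmless because _consume() starts by ignoring directories. A compact comparison, assuming directory is a Path:

from pathlib import Path

directory = Path("/consume")  # illustrative

for filepath in directory.iterdir():  # non-recursive; files and dirs alike
    ...

for filepath in directory.rglob("*"):  # a pure-pathlib recursive walk, if
    ...                                # os.walk were to be dropped too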
@@ -310,7 +306,7 @@ class Command(BaseCommand):
             try:
                 for event in inotify.read(timeout=timeout_ms):
                     path = inotify.get_path(event.wd) if recursive else directory
-                    filepath = os.path.join(path, event.name)
+                    filepath = Path(path) / event.name
                     if flags.MODIFY in flags.from_mask(event.mask):
                         notified_files.pop(filepath, None)
                     else:
@@ -327,9 +323,7 @@ class Command(BaseCommand):

                 # Also make sure the file exists still, some scanners might write a
                 # temporary file first
-                file_still_exists = os.path.exists(filepath) and os.path.isfile(
-                    filepath,
-                )
+                file_still_exists = filepath.exists() and filepath.is_file()

                 if waited_long_enough and file_still_exists:
                     _consume(filepath)
@@ -5,6 +5,7 @@ import logging
 import os
 import shutil
 from collections import defaultdict
+from pathlib import Path
 from time import sleep

 import pathvalidate
@@ -50,38 +51,38 @@ def many_to_dictionary(field): # pragma: no cover
     return mydictionary


-def archive_name_from_filename(filename):
-    return os.path.splitext(filename)[0] + ".pdf"
+def archive_name_from_filename(filename: Path) -> Path:
+    return Path(filename.stem + ".pdf")


-def archive_path_old(doc):
+def archive_path_old(doc) -> Path:
     if doc.filename:
-        fname = archive_name_from_filename(doc.filename)
+        fname = archive_name_from_filename(Path(doc.filename))
     else:
-        fname = f"{doc.pk:07}.pdf"
+        fname = Path(f"{doc.pk:07}.pdf")

-    return os.path.join(settings.ARCHIVE_DIR, fname)
+    return settings.ARCHIVE_DIR / fname


 STORAGE_TYPE_GPG = "gpg"


-def archive_path_new(doc):
+def archive_path_new(doc) -> Path | None:
     if doc.archive_filename is not None:
-        return os.path.join(settings.ARCHIVE_DIR, str(doc.archive_filename))
+        return settings.ARCHIVE_DIR / doc.archive_filename
     else:
         return None


-def source_path(doc):
+def source_path(doc) -> Path:
     if doc.filename:
-        fname = str(doc.filename)
+        fname = doc.filename
     else:
         fname = f"{doc.pk:07}{doc.file_type}"
     if doc.storage_type == STORAGE_TYPE_GPG:
-        fname += ".gpg"  # pragma: no cover
+        fname = Path(str(fname) + ".gpg")  # pragma: no cover

-    return os.path.join(settings.ORIGINALS_DIR, fname)
+    return settings.ORIGINALS_DIR / fname


 def generate_unique_filename(doc, *, archive_filename=False):
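All of the os.path.join() rewrites above depend on settings.ARCHIVE_DIR and settings.ORIGINALS_DIR being Path objects, since the `/` operator is only defined when at least the left operand is a Path. A minimal sketch, with a stand-in for the settings value:

from pathlib import Path

ARCHIVE_DIR = Path("/data/archive")  # stand-in for settings.ARCHIVE_DIR

print(ARCHIVE_DIR / "0000042.pdf")        # /data/archive/0000042.pdf
print(ARCHIVE_DIR / Path("0000042.pdf"))  # a Path right operand works too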
@@ -104,7 +105,7 @@ def generate_unique_filename(doc, *, archive_filename=False):
             # still the same as before.
             return new_filename

-        if os.path.exists(os.path.join(root, new_filename)):
+        if (root / new_filename).exists():
             counter += 1
         else:
             return new_filename
@@ -202,18 +203,18 @@ def create_archive_version(doc, retry_count=3):
             parser,
             source_path(doc),
             doc.mime_type,
-            os.path.basename(doc.filename),
+            Path(doc.filename).name,
         )
         doc.content = parser.get_text()

-        if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()):
+        if parser.get_archive_path() and Path(parser.get_archive_path()).is_file():
             doc.archive_filename = generate_unique_filename(
                 doc,
                 archive_filename=True,
             )
-            with open(parser.get_archive_path(), "rb") as f:
+            with Path(parser.get_archive_path()).open("rb") as f:
                 doc.archive_checksum = hashlib.md5(f.read()).hexdigest()
-            os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True)
+            archive_path_new(doc).parent.mkdir(parents=True, exist_ok=True)
             shutil.copy2(parser.get_archive_path(), archive_path_new(doc))
         else:
             doc.archive_checksum = None
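The directory-creation rewrite above is one-to-one: both lines below ensure the parent directory exists without raising if it already does (the target path is illustrative):

import os
from pathlib import Path

target = Path("/data/archive/2024/doc.pdf")  # illustrative

os.makedirs(os.path.dirname(target), exist_ok=True)  # old spelling
target.parent.mkdir(parents=True, exist_ok=True)     # new spelling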
@@ -264,7 +265,7 @@ def move_old_to_new_locations(apps, schema_editor):
     # check that archive files of all unaffected documents are in place
     for doc in Document.objects.filter(archive_checksum__isnull=False):
         old_path = archive_path_old(doc)
-        if doc.id not in affected_document_ids and not os.path.isfile(old_path):
+        if doc.id not in affected_document_ids and not old_path.is_file():
             raise ValueError(
                 f"Archived document ID:{doc.id} does not exist at: {old_path}",
             )
@@ -285,12 +286,12 @@ def move_old_to_new_locations(apps, schema_editor):
         if doc.id in affected_document_ids:
             old_path = archive_path_old(doc)
             # remove affected archive versions
-            if os.path.isfile(old_path):
+            if old_path.is_file():
                 logger.debug(f"Removing {old_path}")
-                os.unlink(old_path)
+                old_path.unlink()
         else:
             # Set archive path for unaffected files
-            doc.archive_filename = archive_name_from_filename(doc.filename)
+            doc.archive_filename = archive_name_from_filename(Path(doc.filename))
             Document.objects.filter(id=doc.id).update(
                 archive_filename=doc.archive_filename,
             )
@@ -316,7 +317,7 @@ def move_new_to_old_locations(apps, schema_editor):
                 f"filename.",
             )
         old_archive_paths.add(old_archive_path)
-        if new_archive_path != old_archive_path and os.path.isfile(old_archive_path):
+        if new_archive_path != old_archive_path and old_archive_path.is_file():
             raise ValueError(
                 f"Cannot migrate: Cannot move {new_archive_path} to "
                 f"{old_archive_path}: file already exists.",
@@ -169,7 +169,7 @@ def run_convert(
     args += ["-depth", str(depth)] if depth else []
     args += ["-auto-orient"] if auto_orient else []
     args += ["-define", "pdf:use-cropbox=true"] if use_cropbox else []
-    args += [input_file, output_file]
+    args += [str(input_file), str(output_file)]

     logger.debug("Execute: " + " ".join(args), extra={"group": logging_group})
@@ -188,8 +188,8 @@ def get_default_thumbnail() -> Path:
     return (Path(__file__).parent / "resources" / "document.webp").resolve()


-def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str:
-    out_path = os.path.join(temp_dir, "convert_gs.webp")
+def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> Path:
+    out_path: Path = Path(temp_dir) / "convert_gs.webp"

     # if convert fails, fall back to extracting
     # the first PDF page as a PNG using Ghostscript
@@ -199,7 +199,7 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str:
         extra={"group": logging_group},
     )
     # Ghostscript doesn't handle WebP outputs
-    gs_out_path = os.path.join(temp_dir, "gs_out.png")
+    gs_out_path: Path = Path(temp_dir) / "gs_out.png"
     cmd = [settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, in_path]

     try:
@@ -227,16 +227,16 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str:
         # The caller might expect a generated thumbnail that can be moved,
         # so we need to copy it before it gets moved.
         # https://github.com/paperless-ngx/paperless-ngx/issues/3631
-        default_thumbnail_path = os.path.join(temp_dir, "document.webp")
+        default_thumbnail_path: Path = Path(temp_dir) / "document.webp"
         copy_file_with_basic_stats(get_default_thumbnail(), default_thumbnail_path)
         return default_thumbnail_path


-def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path:
+def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) -> Path:
     """
     The thumbnail of a PDF is just a 500px wide image of the first page.
     """
-    out_path = temp_dir / "convert.webp"
+    out_path: Path = temp_dir / "convert.webp"

     # Run convert to get a decent thumbnail
     try:
@@ -654,7 +654,7 @@ class TestClassifier(DirectoriesMixin, TestCase):
         },
     )
     @override_settings(
-        MODEL_FILE=(Path(__file__).parent / "data" / "model.pickle").as_posix(),
+        MODEL_FILE=str(Path(__file__).parent / "data" / "model.pickle"),
     )
     @pytest.mark.skip(
         reason="Disabled caching due to high memory usage - need to investigate.",
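Swapping .as_posix() for str() only changes behaviour on Windows, where the two disagree about separators; a quick illustration with PureWindowsPath:

from pathlib import PureWindowsPath

p = PureWindowsPath("data") / "model.pickle"
print(p.as_posix())  # data/model.pickle  (forward slashes, always)
print(str(p))        # data\model.pickle  (native separator)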
@@ -254,7 +254,7 @@ class TestConsumer(
         # https://github.com/jonaswinkler/paperless-ng/discussions/1037

         filename = self.get_test_file()
-        shadow_file = Path(self.dirs.scratch_dir / "._sample.pdf")
+        shadow_file = Path(self.dirs.scratch_dir) / "._sample.pdf"

         shutil.copy(filename, shadow_file)
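The old expression only worked when scratch_dir was already a Path, because the `/` operator is evaluated before the Path() call; converting first makes a plain string work too. A sketch of the failure mode:

from pathlib import Path

scratch_dir = "/tmp/scratch"  # plain str, as a fixture might provide
# Path(scratch_dir / "._sample.pdf")  # TypeError: str has no `/` operator
shadow_file = Path(scratch_dir) / "._sample.pdf"  # works for str and Path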
@@ -258,66 +258,66 @@ class TestConsumer(DirectoriesMixin, ConsumerThreadMixin, TransactionTestCase):
     def test_is_ignored(self):
         test_paths = [
             {
-                "path": (Path(self.dirs.consumption_dir) / "foo.pdf").as_posix(),
+                "path": str(Path(self.dirs.consumption_dir) / "foo.pdf"),
                 "ignore": False,
             },
             {
-                "path": (
-                    Path(self.dirs.consumption_dir) / "foo" / "bar.pdf"
-                ).as_posix(),
+                "path": str(
+                    Path(self.dirs.consumption_dir) / "foo" / "bar.pdf",
+                ),
                 "ignore": False,
             },
             {
-                "path": (Path(self.dirs.consumption_dir) / ".DS_STORE").as_posix(),
+                "path": str(Path(self.dirs.consumption_dir) / ".DS_STORE"),
                 "ignore": True,
             },
             {
-                "path": (Path(self.dirs.consumption_dir) / ".DS_Store").as_posix(),
+                "path": str(Path(self.dirs.consumption_dir) / ".DS_Store"),
                 "ignore": True,
             },
             {
-                "path": (
-                    Path(self.dirs.consumption_dir) / ".stfolder" / "foo.pdf"
-                ).as_posix(),
+                "path": str(
+                    Path(self.dirs.consumption_dir) / ".stfolder" / "foo.pdf",
+                ),
                 "ignore": True,
             },
             {
-                "path": (Path(self.dirs.consumption_dir) / ".stfolder.pdf").as_posix(),
+                "path": str(Path(self.dirs.consumption_dir) / ".stfolder.pdf"),
                 "ignore": False,
             },
             {
-                "path": (
-                    Path(self.dirs.consumption_dir) / ".stversions" / "foo.pdf"
-                ).as_posix(),
+                "path": str(
+                    Path(self.dirs.consumption_dir) / ".stversions" / "foo.pdf",
+                ),
                 "ignore": True,
             },
             {
-                "path": (
-                    Path(self.dirs.consumption_dir) / ".stversions.pdf"
-                ).as_posix(),
+                "path": str(
+                    Path(self.dirs.consumption_dir) / ".stversions.pdf",
+                ),
                 "ignore": False,
             },
             {
-                "path": (Path(self.dirs.consumption_dir) / "._foo.pdf").as_posix(),
+                "path": str(Path(self.dirs.consumption_dir) / "._foo.pdf"),
                 "ignore": True,
             },
             {
-                "path": (Path(self.dirs.consumption_dir) / "my_foo.pdf").as_posix(),
+                "path": str(Path(self.dirs.consumption_dir) / "my_foo.pdf"),
                 "ignore": False,
             },
             {
-                "path": (
-                    Path(self.dirs.consumption_dir) / "._foo" / "bar.pdf"
-                ).as_posix(),
+                "path": str(
+                    Path(self.dirs.consumption_dir) / "._foo" / "bar.pdf",
+                ),
                 "ignore": True,
             },
             {
-                "path": (
+                "path": str(
                     Path(self.dirs.consumption_dir)
                     / "@eaDir"
                     / "SYNO@.fileindexdb"
-                    / "_1jk.fnm"
-                ).as_posix(),
+                    / "_1jk.fnm",
+                ),
                 "ignore": True,
             },
         ]
@@ -330,7 +330,7 @@ class TestConsumer(DirectoriesMixin, ConsumerThreadMixin, TransactionTestCase):
                 f'_is_ignored("{filepath}") != {expected_ignored_result}',
             )

-    @mock.patch("documents.management.commands.document_consumer.open")
+    @mock.patch("documents.management.commands.document_consumer.Path.open")
     def test_consume_file_busy(self, open_mock):
         # Calling this mock always raises this
         open_mock.side_effect = OSError
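Because the consumer now calls the method form filepath.open(), the test has to patch Path.open on the class rather than the module-level open. A self-contained sketch of the same technique:

from pathlib import Path
from unittest import mock

with mock.patch("pathlib.Path.open", side_effect=OSError):
    try:
        Path("/tmp/anything").open("rb")
    except OSError:
        print("open intercepted")  # every Path instance is affected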
@@ -230,9 +230,9 @@ class TestExportImport(

         for element in manifest:
             if element["model"] == "documents.document":
-                fname = (
-                    self.target / element[document_exporter.EXPORTER_FILE_NAME]
-                ).as_posix()
+                fname = str(
+                    self.target / element[document_exporter.EXPORTER_FILE_NAME],
+                )
                 self.assertIsFile(fname)
                 self.assertIsFile(
                     self.target / element[document_exporter.EXPORTER_THUMBNAIL_NAME],
@@ -462,9 +462,9 @@ class TestExportImport(

         call_command(*args)

-        expected_file = (
-            self.target / f"export-{timezone.localdate().isoformat()}.zip"
-        ).as_posix()
+        expected_file = str(
+            self.target / f"export-{timezone.localdate().isoformat()}.zip",
+        )

         self.assertIsFile(expected_file)
@@ -498,9 +498,9 @@ class TestExportImport(
         ):
             call_command(*args)

-        expected_file = (
-            self.target / f"export-{timezone.localdate().isoformat()}.zip"
-        ).as_posix()
+        expected_file = str(
+            self.target / f"export-{timezone.localdate().isoformat()}.zip",
+        )

         self.assertIsFile(expected_file)
@@ -544,9 +544,9 @@ class TestExportImport(

         call_command(*args)

-        expected_file = (
-            self.target / f"export-{timezone.localdate().isoformat()}.zip"
-        ).as_posix()
+        expected_file = str(
+            self.target / f"export-{timezone.localdate().isoformat()}.zip",
+        )

         self.assertIsFile(expected_file)
         self.assertIsNotFile(existing_file)
@@ -19,15 +19,15 @@ migration_1012_obj = importlib.import_module(
 )


-def archive_name_from_filename(filename):
-    return Path(filename).stem + ".pdf"
+def archive_name_from_filename(filename: Path) -> Path:
+    return Path(filename.stem + ".pdf")


-def archive_path_old(self):
+def archive_path_old(self) -> Path:
     if self.filename:
-        fname = archive_name_from_filename(self.filename)
+        fname = archive_name_from_filename(Path(self.filename))
     else:
-        fname = f"{self.pk:07}.pdf"
+        fname = Path(f"{self.pk:07}.pdf")

     return Path(settings.ARCHIVE_DIR) / fname