From d2064a2535d5b195ca652686891b7ba91cbccda8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Steinbei=C3=9Fer?= <33968289+gothicVI@users.noreply.github.com> Date: Wed, 3 Sep 2025 17:12:41 +0200 Subject: [PATCH] Chore: switch from os.path to pathlib.Path (#10539) --- pyproject.toml | 9 -- .../management/commands/document_consumer.py | 44 ++++---- .../migrations/1012_fix_archive_files.py | 45 ++++---- src/documents/parsers.py | 14 +-- src/documents/tests/test_classifier.py | 2 +- src/documents/tests/test_consumer.py | 2 +- .../tests/test_management_consumer.py | 50 ++++----- .../tests/test_management_exporter.py | 24 ++-- .../tests/test_migration_archive_files.py | 10 +- src/paperless/settings.py | 12 +- src/paperless_tesseract/tests/test_parser.py | 104 +++++++++--------- 11 files changed, 151 insertions(+), 165 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d1e729193..7ae88a678 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -205,18 +205,9 @@ lint.per-file-ignores."docker/wait-for-redis.py" = [ "INP001", "T201", ] -lint.per-file-ignores."src/documents/management/commands/document_consumer.py" = [ - "PTH", -] # TODO Enable & remove -lint.per-file-ignores."src/documents/migrations/1012_fix_archive_files.py" = [ - "PTH", -] # TODO Enable & remove lint.per-file-ignores."src/documents/models.py" = [ "SIM115", ] -lint.per-file-ignores."src/documents/parsers.py" = [ - "PTH", -] # TODO Enable & remove lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [ "RUF001", ] diff --git a/src/documents/management/commands/document_consumer.py b/src/documents/management/commands/document_consumer.py index 1e98533f0..35d79288e 100644 --- a/src/documents/management/commands/document_consumer.py +++ b/src/documents/management/commands/document_consumer.py @@ -32,7 +32,7 @@ except ImportError: # pragma: no cover logger = logging.getLogger("paperless.management.consumer") -def _tags_from_path(filepath) -> list[int]: +def _tags_from_path(filepath: Path) -> list[int]: """ Walk up the directory tree from filepath to CONSUMPTION_DIR and get or create Tag IDs for every directory. @@ -41,7 +41,7 @@ def _tags_from_path(filepath) -> list[int]: """ db.close_old_connections() tag_ids = set() - path_parts = Path(filepath).relative_to(settings.CONSUMPTION_DIR).parent.parts + path_parts = filepath.relative_to(settings.CONSUMPTION_DIR).parent.parts for part in path_parts: tag_ids.add( Tag.objects.get_or_create(name__iexact=part, defaults={"name": part})[0].pk, @@ -50,17 +50,13 @@ def _tags_from_path(filepath) -> list[int]: return list(tag_ids) -def _is_ignored(filepath: str) -> bool: +def _is_ignored(filepath: Path) -> bool: """ Checks if the given file should be ignored, based on configured patterns. 
Returns True if the file is ignored, False otherwise """ - filepath = os.path.abspath( - os.path.normpath(filepath), - ) - # Trim out the consume directory, leaving only filename and it's # path relative to the consume directory filepath_relative = PurePath(filepath).relative_to(settings.CONSUMPTION_DIR) @@ -85,15 +81,15 @@ def _is_ignored(filepath: str) -> bool: return False -def _consume(filepath: str) -> None: - if os.path.isdir(filepath) or _is_ignored(filepath): +def _consume(filepath: Path) -> None: + if filepath.is_dir() or _is_ignored(filepath): return - if not os.path.isfile(filepath): + if not filepath.is_file(): logger.debug(f"Not consuming file {filepath}: File has moved.") return - if not is_file_ext_supported(os.path.splitext(filepath)[1]): + if not is_file_ext_supported(filepath.suffix): logger.warning(f"Not consuming file {filepath}: Unknown file extension.") return @@ -107,7 +103,7 @@ def _consume(filepath: str) -> None: while (read_try_count < os_error_retry_count) and not file_open_ok: try: - with open(filepath, "rb"): + with filepath.open("rb"): file_open_ok = True except OSError as e: read_try_count += 1 @@ -141,7 +137,7 @@ def _consume(filepath: str) -> None: logger.exception("Error while consuming document") -def _consume_wait_unmodified(file: str) -> None: +def _consume_wait_unmodified(file: Path) -> None: """ Waits for the given file to appear unmodified based on file size and modification time. Will wait a configured number of seconds @@ -157,7 +153,7 @@ def _consume_wait_unmodified(file: str) -> None: current_try = 0 while current_try < settings.CONSUMER_POLLING_RETRY_COUNT: try: - stat_data = os.stat(file) + stat_data = file.stat() new_mtime = stat_data.st_mtime new_size = stat_data.st_size except FileNotFoundError: @@ -182,10 +178,10 @@ class Handler(FileSystemEventHandler): self._pool = pool def on_created(self, event): - self._pool.submit(_consume_wait_unmodified, event.src_path) + self._pool.submit(_consume_wait_unmodified, Path(event.src_path)) def on_moved(self, event): - self._pool.submit(_consume_wait_unmodified, event.dest_path) + self._pool.submit(_consume_wait_unmodified, Path(event.dest_path)) class Command(BaseCommand): @@ -227,9 +223,9 @@ class Command(BaseCommand): if not directory: raise CommandError("CONSUMPTION_DIR does not appear to be set.") - directory = os.path.abspath(directory) + directory = Path(directory).resolve() - if not os.path.isdir(directory): + if not directory.is_dir(): raise CommandError(f"Consumption directory {directory} does not exist") # Consumer will need this @@ -238,11 +234,11 @@ class Command(BaseCommand): if recursive: for dirpath, _, filenames in os.walk(directory): for filename in filenames: - filepath = os.path.join(dirpath, filename) + filepath = Path(dirpath) / filename _consume(filepath) else: - for entry in os.scandir(directory): - _consume(entry.path) + for filepath in directory.iterdir(): + _consume(filepath) if options["oneshot"]: return @@ -310,7 +306,7 @@ class Command(BaseCommand): try: for event in inotify.read(timeout=timeout_ms): path = inotify.get_path(event.wd) if recursive else directory - filepath = os.path.join(path, event.name) + filepath = Path(path) / event.name if flags.MODIFY in flags.from_mask(event.mask): notified_files.pop(filepath, None) else: @@ -327,9 +323,7 @@ class Command(BaseCommand): # Also make sure the file exists still, some scanners might write a # temporary file first - file_still_exists = os.path.exists(filepath) and os.path.isfile( - filepath, - ) + file_still_exists = 
filepath.exists() and filepath.is_file() if waited_long_enough and file_still_exists: _consume(filepath) diff --git a/src/documents/migrations/1012_fix_archive_files.py b/src/documents/migrations/1012_fix_archive_files.py index 46951471e..a97fa7a80 100644 --- a/src/documents/migrations/1012_fix_archive_files.py +++ b/src/documents/migrations/1012_fix_archive_files.py @@ -5,6 +5,7 @@ import logging import os import shutil from collections import defaultdict +from pathlib import Path from time import sleep import pathvalidate @@ -50,38 +51,38 @@ def many_to_dictionary(field): # pragma: no cover return mydictionary -def archive_name_from_filename(filename): - return os.path.splitext(filename)[0] + ".pdf" +def archive_name_from_filename(filename: Path) -> Path: + return Path(filename.stem + ".pdf") -def archive_path_old(doc): +def archive_path_old(doc) -> Path: if doc.filename: - fname = archive_name_from_filename(doc.filename) + fname = archive_name_from_filename(Path(doc.filename)) else: - fname = f"{doc.pk:07}.pdf" + fname = Path(f"{doc.pk:07}.pdf") - return os.path.join(settings.ARCHIVE_DIR, fname) + return settings.ARCHIVE_DIR / fname STORAGE_TYPE_GPG = "gpg" -def archive_path_new(doc): +def archive_path_new(doc) -> Path | None: if doc.archive_filename is not None: - return os.path.join(settings.ARCHIVE_DIR, str(doc.archive_filename)) + return settings.ARCHIVE_DIR / doc.archive_filename else: return None -def source_path(doc): +def source_path(doc) -> Path: if doc.filename: - fname = str(doc.filename) + fname = doc.filename else: fname = f"{doc.pk:07}{doc.file_type}" if doc.storage_type == STORAGE_TYPE_GPG: - fname += ".gpg" # pragma: no cover + fname = Path(str(fname) + ".gpg") # pragma: no cover - return os.path.join(settings.ORIGINALS_DIR, fname) + return settings.ORIGINALS_DIR / fname def generate_unique_filename(doc, *, archive_filename=False): @@ -104,7 +105,7 @@ def generate_unique_filename(doc, *, archive_filename=False): # still the same as before. 
return new_filename - if os.path.exists(os.path.join(root, new_filename)): + if (root / new_filename).exists(): counter += 1 else: return new_filename @@ -202,18 +203,18 @@ def create_archive_version(doc, retry_count=3): parser, source_path(doc), doc.mime_type, - os.path.basename(doc.filename), + Path(doc.filename).name, ) doc.content = parser.get_text() - if parser.get_archive_path() and os.path.isfile(parser.get_archive_path()): + if parser.get_archive_path() and Path(parser.get_archive_path()).is_file(): doc.archive_filename = generate_unique_filename( doc, archive_filename=True, ) - with open(parser.get_archive_path(), "rb") as f: + with Path(parser.get_archive_path()).open("rb") as f: doc.archive_checksum = hashlib.md5(f.read()).hexdigest() - os.makedirs(os.path.dirname(archive_path_new(doc)), exist_ok=True) + archive_path_new(doc).parent.mkdir(parents=True, exist_ok=True) shutil.copy2(parser.get_archive_path(), archive_path_new(doc)) else: doc.archive_checksum = None @@ -264,7 +265,7 @@ def move_old_to_new_locations(apps, schema_editor): # check that archive files of all unaffected documents are in place for doc in Document.objects.filter(archive_checksum__isnull=False): old_path = archive_path_old(doc) - if doc.id not in affected_document_ids and not os.path.isfile(old_path): + if doc.id not in affected_document_ids and not old_path.is_file(): raise ValueError( f"Archived document ID:{doc.id} does not exist at: {old_path}", ) @@ -285,12 +286,12 @@ def move_old_to_new_locations(apps, schema_editor): if doc.id in affected_document_ids: old_path = archive_path_old(doc) # remove affected archive versions - if os.path.isfile(old_path): + if old_path.is_file(): logger.debug(f"Removing {old_path}") - os.unlink(old_path) + old_path.unlink() else: # Set archive path for unaffected files - doc.archive_filename = archive_name_from_filename(doc.filename) + doc.archive_filename = archive_name_from_filename(Path(doc.filename)) Document.objects.filter(id=doc.id).update( archive_filename=doc.archive_filename, ) @@ -316,7 +317,7 @@ def move_new_to_old_locations(apps, schema_editor): f"filename.", ) old_archive_paths.add(old_archive_path) - if new_archive_path != old_archive_path and os.path.isfile(old_archive_path): + if new_archive_path != old_archive_path and old_archive_path.is_file(): raise ValueError( f"Cannot migrate: Cannot move {new_archive_path} to " f"{old_archive_path}: file already exists.", diff --git a/src/documents/parsers.py b/src/documents/parsers.py index b1f7061f8..f6417e285 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -169,7 +169,7 @@ def run_convert( args += ["-depth", str(depth)] if depth else [] args += ["-auto-orient"] if auto_orient else [] args += ["-define", "pdf:use-cropbox=true"] if use_cropbox else [] - args += [input_file, output_file] + args += [str(input_file), str(output_file)] logger.debug("Execute: " + " ".join(args), extra={"group": logging_group}) @@ -188,8 +188,8 @@ def get_default_thumbnail() -> Path: return (Path(__file__).parent / "resources" / "document.webp").resolve() -def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str: - out_path = os.path.join(temp_dir, "convert_gs.webp") +def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> Path: + out_path: Path = Path(temp_dir) / "convert_gs.webp" # if convert fails, fall back to extracting # the first PDF page as a PNG using Ghostscript @@ -199,7 +199,7 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, 
logging_group=None) - extra={"group": logging_group}, ) # Ghostscript doesn't handle WebP outputs - gs_out_path = os.path.join(temp_dir, "gs_out.png") + gs_out_path: Path = Path(temp_dir) / "gs_out.png" cmd = [settings.GS_BINARY, "-q", "-sDEVICE=pngalpha", "-o", gs_out_path, in_path] try: @@ -227,16 +227,16 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) - # The caller might expect a generated thumbnail that can be moved, # so we need to copy it before it gets moved. # https://github.com/paperless-ngx/paperless-ngx/issues/3631 - default_thumbnail_path = os.path.join(temp_dir, "document.webp") + default_thumbnail_path: Path = Path(temp_dir) / "document.webp" copy_file_with_basic_stats(get_default_thumbnail(), default_thumbnail_path) return default_thumbnail_path -def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> Path: +def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) -> Path: """ The thumbnail of a PDF is just a 500px wide image of the first page. """ - out_path = temp_dir / "convert.webp" + out_path: Path = temp_dir / "convert.webp" # Run convert to get a decent thumbnail try: diff --git a/src/documents/tests/test_classifier.py b/src/documents/tests/test_classifier.py index b1317f70b..4f2ad85f5 100644 --- a/src/documents/tests/test_classifier.py +++ b/src/documents/tests/test_classifier.py @@ -654,7 +654,7 @@ class TestClassifier(DirectoriesMixin, TestCase): }, ) @override_settings( - MODEL_FILE=(Path(__file__).parent / "data" / "model.pickle").as_posix(), + MODEL_FILE=str(Path(__file__).parent / "data" / "model.pickle"), ) @pytest.mark.skip( reason="Disabled caching due to high memory usage - need to investigate.", diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index f0fdc02c7..6709155d9 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -254,7 +254,7 @@ class TestConsumer( # https://github.com/jonaswinkler/paperless-ng/discussions/1037 filename = self.get_test_file() - shadow_file = Path(self.dirs.scratch_dir / "._sample.pdf") + shadow_file = Path(self.dirs.scratch_dir) / "._sample.pdf" shutil.copy(filename, shadow_file) diff --git a/src/documents/tests/test_management_consumer.py b/src/documents/tests/test_management_consumer.py index 8f933d741..821fd82e0 100644 --- a/src/documents/tests/test_management_consumer.py +++ b/src/documents/tests/test_management_consumer.py @@ -258,66 +258,66 @@ class TestConsumer(DirectoriesMixin, ConsumerThreadMixin, TransactionTestCase): def test_is_ignored(self): test_paths = [ { - "path": (Path(self.dirs.consumption_dir) / "foo.pdf").as_posix(), + "path": str(Path(self.dirs.consumption_dir) / "foo.pdf"), "ignore": False, }, { - "path": ( - Path(self.dirs.consumption_dir) / "foo" / "bar.pdf" - ).as_posix(), + "path": str( + Path(self.dirs.consumption_dir) / "foo" / "bar.pdf", + ), "ignore": False, }, { - "path": (Path(self.dirs.consumption_dir) / ".DS_STORE").as_posix(), + "path": str(Path(self.dirs.consumption_dir) / ".DS_STORE"), "ignore": True, }, { - "path": (Path(self.dirs.consumption_dir) / ".DS_Store").as_posix(), + "path": str(Path(self.dirs.consumption_dir) / ".DS_Store"), "ignore": True, }, { - "path": ( - Path(self.dirs.consumption_dir) / ".stfolder" / "foo.pdf" - ).as_posix(), + "path": str( + Path(self.dirs.consumption_dir) / ".stfolder" / "foo.pdf", + ), "ignore": True, }, { - "path": (Path(self.dirs.consumption_dir) / ".stfolder.pdf").as_posix(), + "path": 
str(Path(self.dirs.consumption_dir) / ".stfolder.pdf"), "ignore": False, }, { - "path": ( - Path(self.dirs.consumption_dir) / ".stversions" / "foo.pdf" - ).as_posix(), + "path": str( + Path(self.dirs.consumption_dir) / ".stversions" / "foo.pdf", + ), "ignore": True, }, { - "path": ( - Path(self.dirs.consumption_dir) / ".stversions.pdf" - ).as_posix(), + "path": str( + Path(self.dirs.consumption_dir) / ".stversions.pdf", + ), "ignore": False, }, { - "path": (Path(self.dirs.consumption_dir) / "._foo.pdf").as_posix(), + "path": str(Path(self.dirs.consumption_dir) / "._foo.pdf"), "ignore": True, }, { - "path": (Path(self.dirs.consumption_dir) / "my_foo.pdf").as_posix(), + "path": str(Path(self.dirs.consumption_dir) / "my_foo.pdf"), "ignore": False, }, { - "path": ( - Path(self.dirs.consumption_dir) / "._foo" / "bar.pdf" - ).as_posix(), + "path": str( + Path(self.dirs.consumption_dir) / "._foo" / "bar.pdf", + ), "ignore": True, }, { - "path": ( + "path": str( Path(self.dirs.consumption_dir) / "@eaDir" / "SYNO@.fileindexdb" - / "_1jk.fnm" - ).as_posix(), + / "_1jk.fnm", + ), "ignore": True, }, ] @@ -330,7 +330,7 @@ class TestConsumer(DirectoriesMixin, ConsumerThreadMixin, TransactionTestCase): f'_is_ignored("{filepath}") != {expected_ignored_result}', ) - @mock.patch("documents.management.commands.document_consumer.open") + @mock.patch("documents.management.commands.document_consumer.Path.open") def test_consume_file_busy(self, open_mock): # Calling this mock always raises this open_mock.side_effect = OSError diff --git a/src/documents/tests/test_management_exporter.py b/src/documents/tests/test_management_exporter.py index 7415467de..a67e5e8c5 100644 --- a/src/documents/tests/test_management_exporter.py +++ b/src/documents/tests/test_management_exporter.py @@ -230,9 +230,9 @@ class TestExportImport( for element in manifest: if element["model"] == "documents.document": - fname = ( - self.target / element[document_exporter.EXPORTER_FILE_NAME] - ).as_posix() + fname = str( + self.target / element[document_exporter.EXPORTER_FILE_NAME], + ) self.assertIsFile(fname) self.assertIsFile( self.target / element[document_exporter.EXPORTER_THUMBNAIL_NAME], @@ -462,9 +462,9 @@ class TestExportImport( call_command(*args) - expected_file = ( - self.target / f"export-{timezone.localdate().isoformat()}.zip" - ).as_posix() + expected_file = str( + self.target / f"export-{timezone.localdate().isoformat()}.zip", + ) self.assertIsFile(expected_file) @@ -498,9 +498,9 @@ class TestExportImport( ): call_command(*args) - expected_file = ( - self.target / f"export-{timezone.localdate().isoformat()}.zip" - ).as_posix() + expected_file = str( + self.target / f"export-{timezone.localdate().isoformat()}.zip", + ) self.assertIsFile(expected_file) @@ -544,9 +544,9 @@ class TestExportImport( call_command(*args) - expected_file = ( - self.target / f"export-{timezone.localdate().isoformat()}.zip" - ).as_posix() + expected_file = str( + self.target / f"export-{timezone.localdate().isoformat()}.zip", + ) self.assertIsFile(expected_file) self.assertIsNotFile(existing_file) diff --git a/src/documents/tests/test_migration_archive_files.py b/src/documents/tests/test_migration_archive_files.py index e5ad44b9e..402897e2f 100644 --- a/src/documents/tests/test_migration_archive_files.py +++ b/src/documents/tests/test_migration_archive_files.py @@ -19,15 +19,15 @@ migration_1012_obj = importlib.import_module( ) -def archive_name_from_filename(filename): - return Path(filename).stem + ".pdf" +def archive_name_from_filename(filename: Path) -> 
Path: + return Path(filename.stem + ".pdf") -def archive_path_old(self): +def archive_path_old(self) -> Path: if self.filename: - fname = archive_name_from_filename(self.filename) + fname = archive_name_from_filename(Path(self.filename)) else: - fname = f"{self.pk:07}.pdf" + fname = Path(f"{self.pk:07}.pdf") return Path(settings.ARCHIVE_DIR) / fname diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 5e6e2a14e..37cf0ecfa 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -679,7 +679,7 @@ def _parse_db_settings() -> dict: databases = { "default": { "ENGINE": "django.db.backends.sqlite3", - "NAME": str(DATA_DIR / "db.sqlite3"), + "NAME": DATA_DIR / "db.sqlite3", "OPTIONS": {}, }, } @@ -807,7 +807,7 @@ LANGUAGES = [ ("zh-tw", _("Chinese Traditional")), ] -LOCALE_PATHS = [str(BASE_DIR / "locale")] +LOCALE_PATHS = [BASE_DIR / "locale"] TIME_ZONE = os.getenv("PAPERLESS_TIME_ZONE", "UTC") @@ -848,21 +848,21 @@ LOGGING = { "file_paperless": { "class": "concurrent_log_handler.ConcurrentRotatingFileHandler", "formatter": "verbose", - "filename": str(LOGGING_DIR / "paperless.log"), + "filename": LOGGING_DIR / "paperless.log", "maxBytes": LOGROTATE_MAX_SIZE, "backupCount": LOGROTATE_MAX_BACKUPS, }, "file_mail": { "class": "concurrent_log_handler.ConcurrentRotatingFileHandler", "formatter": "verbose", - "filename": str(LOGGING_DIR / "mail.log"), + "filename": LOGGING_DIR / "mail.log", "maxBytes": LOGROTATE_MAX_SIZE, "backupCount": LOGROTATE_MAX_BACKUPS, }, "file_celery": { "class": "concurrent_log_handler.ConcurrentRotatingFileHandler", "formatter": "verbose", - "filename": str(LOGGING_DIR / "celery.log"), + "filename": LOGGING_DIR / "celery.log", "maxBytes": LOGROTATE_MAX_SIZE, "backupCount": LOGROTATE_MAX_BACKUPS, }, @@ -921,7 +921,7 @@ CELERY_ACCEPT_CONTENT = ["application/json", "application/x-python-serialize"] CELERY_BEAT_SCHEDULE = _parse_beat_schedule() # https://docs.celeryq.dev/en/stable/userguide/configuration.html#beat-schedule-filename -CELERY_BEAT_SCHEDULE_FILENAME = str(DATA_DIR / "celerybeat-schedule.db") +CELERY_BEAT_SCHEDULE_FILENAME = DATA_DIR / "celerybeat-schedule.db" # Cachalot: Database read cache. 
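For reference, a minimal sketch (an assumption for illustration, not part of the patch) of the os.path to pathlib.Path equivalences the hunks above rely on. The temporary directory and file name are placeholders, not paths used by paperless-ngx.

from pathlib import Path
import tempfile

# Illustrative mapping of the os.path calls this patch replaces with pathlib.
with tempfile.TemporaryDirectory() as tmp:
    filepath = Path(tmp) / "sample.pdf"                  # os.path.join(tmp, "sample.pdf")
    filepath.parent.mkdir(parents=True, exist_ok=True)   # os.makedirs(os.path.dirname(...), exist_ok=True)
    filepath.write_bytes(b"%PDF-")                       # create a dummy file to operate on

    filepath = filepath.resolve()                        # roughly os.path.abspath(os.path.normpath(...))
    print(filepath.is_dir())                             # os.path.isdir(filepath)
    print(filepath.is_file())                            # os.path.isfile(filepath)
    print(filepath.exists())                             # os.path.exists(filepath)
    print(filepath.suffix)                               # os.path.splitext(filepath)[1]
    print(filepath.stem)                                 # os.path.splitext(filepath.name)[0]
    print(filepath.name)                                 # os.path.basename(filepath)
    print(filepath.stat().st_mtime)                      # os.stat(filepath).st_mtime

    with filepath.open("rb") as f:                       # open(filepath, "rb")
        f.read()

    filepath.unlink()                                    # os.unlink(filepath)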
diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 09fc323ae..57f1c8157 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -69,13 +69,13 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(uuid.uuid4()) page_count = parser.get_page_count( - (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "simple-digital.pdf"), "application/pdf", ) self.assertEqual(page_count, 1) page_count = parser.get_page_count( - (self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-mixed.pdf"), "application/pdf", ) self.assertEqual(page_count, 6) @@ -92,7 +92,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): parser = RasterisedDocumentParser(uuid.uuid4()) with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm: page_count = parser.get_page_count( - (self.SAMPLE_FILES / "password-protected.pdf").as_posix(), + str(self.SAMPLE_FILES / "password-protected.pdf"), "application/pdf", ) self.assertEqual(page_count, None) @@ -101,7 +101,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_thumbnail(self): parser = RasterisedDocumentParser(uuid.uuid4()) thumb = parser.get_thumbnail( - (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "simple-digital.pdf"), "application/pdf", ) self.assertIsFile(thumb) @@ -109,7 +109,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): @mock.patch("documents.parsers.run_convert") def test_thumbnail_fallback(self, m): def call_convert(input_file, output_file, **kwargs): - if ".pdf" in input_file: + if ".pdf" in str(input_file): raise ParseError("Does not compute.") else: run_convert(input_file=input_file, output_file=output_file, **kwargs) @@ -118,7 +118,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): parser = RasterisedDocumentParser(uuid.uuid4()) thumb = parser.get_thumbnail( - (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "simple-digital.pdf"), "application/pdf", ) self.assertIsFile(thumb) @@ -126,7 +126,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_thumbnail_encrypted(self): parser = RasterisedDocumentParser(uuid.uuid4()) thumb = parser.get_thumbnail( - (self.SAMPLE_FILES / "encrypted.pdf").as_posix(), + str(self.SAMPLE_FILES / "encrypted.pdf"), "application/pdf", ) self.assertIsFile(thumb) @@ -134,17 +134,17 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_get_dpi(self): parser = RasterisedDocumentParser(None) - dpi = parser.get_dpi((self.SAMPLE_FILES / "simple-no-dpi.png").as_posix()) + dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple-no-dpi.png")) self.assertEqual(dpi, None) - dpi = parser.get_dpi((self.SAMPLE_FILES / "simple.png").as_posix()) + dpi = parser.get_dpi(str(self.SAMPLE_FILES / "simple.png")) self.assertEqual(dpi, 72) def test_simple_digital(self): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "simple-digital.pdf"), "application/pdf", ) @@ -156,7 +156,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "with-form.pdf").as_posix(), + str(self.SAMPLE_FILES / "with-form.pdf"), 
"application/pdf", ) @@ -172,7 +172,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "with-form.pdf").as_posix(), + str(self.SAMPLE_FILES / "with-form.pdf"), "application/pdf", ) @@ -186,7 +186,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_signed(self): parser = RasterisedDocumentParser(None) - parser.parse((self.SAMPLE_FILES / "signed.pdf").as_posix(), "application/pdf") + parser.parse(str(self.SAMPLE_FILES / "signed.pdf"), "application/pdf") self.assertIsNone(parser.archive_path) self.assertContainsStrings( @@ -202,7 +202,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "encrypted.pdf").as_posix(), + str(self.SAMPLE_FILES / "encrypted.pdf"), "application/pdf", ) @@ -213,7 +213,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_with_form_error_notext(self): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "with-form.pdf").as_posix(), + str(self.SAMPLE_FILES / "with-form.pdf"), "application/pdf", ) @@ -227,7 +227,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "with-form.pdf").as_posix(), + str(self.SAMPLE_FILES / "with-form.pdf"), "application/pdf", ) @@ -239,7 +239,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_image_simple(self): parser = RasterisedDocumentParser(None) - parser.parse((self.SAMPLE_FILES / "simple.png").as_posix(), "image/png") + parser.parse(str(self.SAMPLE_FILES / "simple.png"), "image/png") self.assertIsFile(parser.archive_path) @@ -255,7 +255,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): dest_file = Path(tempdir) / "simple-alpha.png" shutil.copy(sample_file, dest_file) - parser.parse(dest_file.as_posix(), "image/png") + parser.parse(str(dest_file), "image/png") self.assertIsFile(parser.archive_path) @@ -265,7 +265,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): parser = RasterisedDocumentParser(None) dpi = parser.calculate_a4_dpi( - (self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(), + str(self.SAMPLE_FILES / "simple-no-dpi.png"), ) self.assertEqual(dpi, 62) @@ -277,7 +277,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def f(): parser.parse( - (self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(), + str(self.SAMPLE_FILES / "simple-no-dpi.png"), "image/png", ) @@ -287,7 +287,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_image_no_dpi_default(self): parser = RasterisedDocumentParser(None) - parser.parse((self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(), "image/png") + parser.parse(str(self.SAMPLE_FILES / "simple-no-dpi.png"), "image/png") self.assertIsFile(parser.archive_path) @@ -299,7 +299,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_multi_page(self): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-digital.pdf"), "application/pdf", ) self.assertIsFile(parser.archive_path) @@ -312,7 +312,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_multi_page_pages_skip(self): parser = RasterisedDocumentParser(None) parser.parse( - 
(self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-digital.pdf"), "application/pdf", ) self.assertIsFile(parser.archive_path) @@ -325,7 +325,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_multi_page_pages_redo(self): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-digital.pdf"), "application/pdf", ) self.assertIsFile(parser.archive_path) @@ -338,7 +338,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_multi_page_pages_force(self): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-digital.pdf"), "application/pdf", ) self.assertIsFile(parser.archive_path) @@ -351,7 +351,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_multi_page_analog_pages_skip(self): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-images.pdf"), "application/pdf", ) self.assertIsFile(parser.archive_path) @@ -375,7 +375,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-images.pdf"), "application/pdf", ) self.assertIsFile(parser.archive_path) @@ -397,7 +397,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-images.pdf"), "application/pdf", ) self.assertIsFile(parser.archive_path) @@ -419,7 +419,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-digital.pdf"), "application/pdf", ) self.assertIsNone(parser.archive_path) @@ -442,7 +442,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-images.pdf"), "application/pdf", ) @@ -467,7 +467,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-digital.pdf"), "application/pdf", ) self.assertIsNotNone(parser.archive_path) @@ -490,7 +490,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-images.pdf"), "application/pdf", ) self.assertIsNotNone(parser.archive_path) @@ -513,7 +513,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-digital.pdf"), "application/pdf", ) self.assertIsNone(parser.archive_path) @@ -536,7 +536,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ 
parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-images.pdf"), "application/pdf", ) self.assertIsNotNone(parser.archive_path) @@ -559,7 +559,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-digital.pdf"), "application/pdf", ) self.assertIsNone(parser.archive_path) @@ -582,7 +582,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-images.pdf"), "application/pdf", ) self.assertIsNone(parser.archive_path) @@ -605,7 +605,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-mixed.pdf"), "application/pdf", ) self.assertIsNotNone(parser.archive_path) @@ -636,7 +636,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "single-page-mixed.pdf").as_posix(), + str(self.SAMPLE_FILES / "single-page-mixed.pdf"), "application/pdf", ) self.assertIsNotNone(parser.archive_path) @@ -673,7 +673,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(), + str(self.SAMPLE_FILES / "multi-page-mixed.pdf"), "application/pdf", ) self.assertIsNone(parser.archive_path) @@ -685,7 +685,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): @override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True) def test_rotate(self): parser = RasterisedDocumentParser(None) - parser.parse((self.SAMPLE_FILES / "rotated.pdf").as_posix(), "application/pdf") + parser.parse(str(self.SAMPLE_FILES / "rotated.pdf"), "application/pdf") self.assertContainsStrings( parser.get_text(), [ @@ -707,7 +707,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): """ parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "multi-page-images.tiff").as_posix(), + str(self.SAMPLE_FILES / "multi-page-images.tiff"), "image/tiff", ) self.assertIsFile(parser.archive_path) @@ -752,9 +752,9 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): - Text from all pages extracted """ parser = RasterisedDocumentParser(None) - sample_file = ( - self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff" - ).as_posix() + sample_file = str( + self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff", + ) with tempfile.NamedTemporaryFile() as tmp_file: shutil.copy(sample_file, tmp_file.name) parser.parse( @@ -843,7 +843,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "rtl-test.pdf").as_posix(), + str(self.SAMPLE_FILES / "rtl-test.pdf"), "application/pdf", ) @@ -858,7 +858,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase): self.assertRaises( ParseError, parser.parse, - (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(), + str(self.SAMPLE_FILES / "simple-digital.pdf"), "application/pdf", ) @@ -868,32 +868,32 @@ 
class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_bmp(self): parser = RasterisedDocumentParser(None) - parser.parse((self.SAMPLE_FILES / "simple.bmp").as_posix(), "image/bmp") + parser.parse(str(self.SAMPLE_FILES / "simple.bmp"), "image/bmp") self.assertIsFile(parser.archive_path) self.assertIn("this is a test document", parser.get_text().lower()) def test_jpg(self): parser = RasterisedDocumentParser(None) - parser.parse((self.SAMPLE_FILES / "simple.jpg").as_posix(), "image/jpeg") + parser.parse(str(self.SAMPLE_FILES / "simple.jpg"), "image/jpeg") self.assertIsFile(parser.archive_path) self.assertIn("this is a test document", parser.get_text().lower()) def test_heic(self): parser = RasterisedDocumentParser(None) - parser.parse((self.SAMPLE_FILES / "simple.heic").as_posix(), "image/heic") + parser.parse(str(self.SAMPLE_FILES / "simple.heic"), "image/heic") self.assertIsFile(parser.archive_path) self.assertIn("pizza", parser.get_text().lower()) @override_settings(OCR_IMAGE_DPI=200) def test_gif(self): parser = RasterisedDocumentParser(None) - parser.parse((self.SAMPLE_FILES / "simple.gif").as_posix(), "image/gif") + parser.parse(str(self.SAMPLE_FILES / "simple.gif"), "image/gif") self.assertIsFile(parser.archive_path) self.assertIn("this is a test document", parser.get_text().lower()) def test_tiff(self): parser = RasterisedDocumentParser(None) - parser.parse((self.SAMPLE_FILES / "simple.tif").as_posix(), "image/tiff") + parser.parse(str(self.SAMPLE_FILES / "simple.tif"), "image/tiff") self.assertIsFile(parser.archive_path) self.assertIn("this is a test document", parser.get_text().lower()) @@ -901,7 +901,7 @@ class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase): def test_webp(self): parser = RasterisedDocumentParser(None) parser.parse( - (self.SAMPLE_FILES / "document.webp").as_posix(), + str(self.SAMPLE_FILES / "document.webp"), "image/webp", ) self.assertIsFile(parser.archive_path)
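The test hunks above follow a single string-boundary convention: paths are composed with pathlib and converted with str() only where a plain string argument is still expected, replacing the earlier .as_posix() calls. A small sketch under that assumption; the samples directory name below is a placeholder, not the real fixture layout.

from pathlib import Path

SAMPLE_FILES = Path(__file__).parent / "samples"    # placeholder fixture directory

sample = SAMPLE_FILES / "simple-digital.pdf"        # stays a Path while being composed
print(str(sample))                                   # str(...) keeps the platform-native separator
print(sample.as_posix())                             # .as_posix() always uses forward slashes

On POSIX systems the two forms are identical; str() preserves the native separator on every platform, which is presumably why the patch standardizes on it over .as_posix().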