Chore: switch from os.path to pathlib.Path (#9933)

Co-authored-by: shamoon <4887959+shamoon@users.noreply.github.com>
This commit is contained in:
Sebastian Steinbeißer 2025-06-18 19:16:59 +02:00 committed by GitHub
parent cc5ba71f06
commit 07882b918b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 104 additions and 106 deletions

View File

@ -221,22 +221,12 @@ lint.per-file-ignores."src/documents/parsers.py" = [
lint.per-file-ignores."src/documents/signals/handlers.py" = [ lint.per-file-ignores."src/documents/signals/handlers.py" = [
"PTH", "PTH",
] # TODO Enable & remove ] # TODO Enable & remove
lint.per-file-ignores."src/documents/views.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/paperless/checks.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/paperless/settings.py" = [ lint.per-file-ignores."src/paperless/settings.py" = [
"PTH", "PTH",
] # TODO Enable & remove ] # TODO Enable & remove
lint.per-file-ignores."src/paperless_mail/mail.py" = [
"PTH",
] # TODO Enable & remove
lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [ lint.per-file-ignores."src/paperless_tesseract/tests/test_parser.py" = [
"PTH",
"RUF001", "RUF001",
] # TODO PTH Enable & remove ]
lint.isort.force-single-line = true lint.isort.force-single-line = true
[tool.pytest.ini_options] [tool.pytest.ini_options]

View File

@ -650,7 +650,7 @@ class DocumentViewSet(
) )
def get_metadata(self, file, mime_type): def get_metadata(self, file, mime_type):
if not os.path.isfile(file): if not Path(file).is_file():
return None return None
parser_class = get_parser_class_for_mime_type(mime_type) parser_class = get_parser_class_for_mime_type(mime_type)
@ -668,8 +668,8 @@ class DocumentViewSet(
return [] return []
def get_filesize(self, filename): def get_filesize(self, filename):
if os.path.isfile(filename): if Path(filename).is_file():
return os.stat(filename).st_size return Path(filename).stat().st_size
else: else:
return None return None
@ -1215,31 +1215,37 @@ class UnifiedSearchViewSet(DocumentViewSet):
class LogViewSet(ViewSet): class LogViewSet(ViewSet):
permission_classes = (IsAuthenticated, PaperlessAdminPermissions) permission_classes = (IsAuthenticated, PaperlessAdminPermissions)
log_files = ["paperless", "mail", "celery"] ALLOWED_LOG_FILES = {
"paperless": "paperless.log",
"mail": "mail.log",
"celery": "celery.log",
}
def get_log_filename(self, log): def get_log_file(self, log_key: str) -> Path:
return os.path.join(settings.LOGGING_DIR, f"{log}.log") return Path(settings.LOGGING_DIR) / self.ALLOWED_LOG_FILES[log_key]
def retrieve(self, request, *args, **kwargs): def retrieve(self, request, *args, **kwargs):
log_file = kwargs.get("pk") log_key = kwargs.get("pk")
if log_file not in self.log_files: if log_key not in self.ALLOWED_LOG_FILES:
raise Http404 raise Http404
filename = self.get_log_filename(log_file) log_file = self.get_log_file(log_key)
if not os.path.isfile(filename): if not log_file.is_file():
raise Http404 raise Http404
with open(filename) as f: with log_file.open() as f:
lines = [line.rstrip() for line in f.readlines()] lines = [line.rstrip() for line in f.readlines()]
return Response(lines) return Response(lines)
def list(self, request, *args, **kwargs): def list(self, request, *args, **kwargs):
exist = [ existing_logs = [
log for log in self.log_files if os.path.isfile(self.get_log_filename(log)) log_key
for log_key in self.ALLOWED_LOG_FILES
if self.get_log_file(log_key).is_file()
] ]
return Response(exist) return Response(existing_logs)
class SavedViewViewSet(ModelViewSet, PassUserMixin): class SavedViewViewSet(ModelViewSet, PassUserMixin):
@ -2073,7 +2079,7 @@ class BulkDownloadView(GenericAPIView):
strategy.add_document(document) strategy.add_document(document)
# TODO(stumpylog): Investigate using FileResponse here # TODO(stumpylog): Investigate using FileResponse here
with open(temp.name, "rb") as f: with Path(temp.name).open("rb") as f:
response = HttpResponse(f, content_type="application/zip") response = HttpResponse(f, content_type="application/zip")
response["Content-Disposition"] = '{}; filename="{}"'.format( response["Content-Disposition"] = '{}; filename="{}"'.format(
"attachment", "attachment",

View File

@ -3,6 +3,7 @@ import os
import pwd import pwd
import shutil import shutil
import stat import stat
from pathlib import Path
from django.conf import settings from django.conf import settings
from django.core.checks import Error from django.core.checks import Error
@ -19,26 +20,23 @@ writeable_hint = (
) )
def path_check(var, directory): def path_check(var, directory: Path) -> list[Error]:
messages = [] messages: list[Error] = []
if directory: if directory:
if not os.path.isdir(directory): if not directory.is_dir():
messages.append( messages.append(
Error(exists_message.format(var), exists_hint.format(directory)), Error(exists_message.format(var), exists_hint.format(directory)),
) )
else: else:
test_file = os.path.join( test_file: Path = directory / f"__paperless_write_test_{os.getpid()}__"
directory,
f"__paperless_write_test_{os.getpid()}__",
)
try: try:
with open(test_file, "w"): with test_file.open("w"):
pass pass
except PermissionError: except PermissionError:
dir_stat = os.stat(directory) dir_stat: os.stat_result = Path(directory).stat()
dir_mode = stat.filemode(dir_stat.st_mode) dir_mode: str = stat.filemode(dir_stat.st_mode)
dir_owner = pwd.getpwuid(dir_stat.st_uid).pw_name dir_owner: str = pwd.getpwuid(dir_stat.st_uid).pw_name
dir_group = grp.getgrgid(dir_stat.st_gid).gr_name dir_group: str = grp.getgrgid(dir_stat.st_gid).gr_name
messages.append( messages.append(
Error( Error(
writeable_message.format(var), writeable_message.format(var),
@ -48,14 +46,18 @@ def path_check(var, directory):
), ),
) )
finally: finally:
if os.path.isfile(test_file): try:
os.remove(test_file) if test_file.is_file():
test_file.unlink()
except (PermissionError, OSError):
# Skip cleanup if we can't access the file — expected in permission tests
pass
return messages return messages
@register() @register()
def paths_check(app_configs, **kwargs): def paths_check(app_configs, **kwargs) -> list[Error]:
""" """
Check the various paths for existence, readability and writeability Check the various paths for existence, readability and writeability
""" """

View File

@ -27,9 +27,9 @@ class TestChecks(DirectoriesMixin, TestCase):
self.assertEqual(paths_check(None), []) self.assertEqual(paths_check(None), [])
@override_settings( @override_settings(
MEDIA_ROOT="uuh", MEDIA_ROOT=Path("uuh"),
DATA_DIR="whatever", DATA_DIR=Path("whatever"),
CONSUMPTION_DIR="idontcare", CONSUMPTION_DIR=Path("idontcare"),
) )
def test_paths_check_dont_exist(self): def test_paths_check_dont_exist(self):
msgs = paths_check(None) msgs = paths_check(None)

View File

@ -1,7 +1,6 @@
import datetime import datetime
import itertools import itertools
import logging import logging
import os
import ssl import ssl
import tempfile import tempfile
import traceback import traceback
@ -484,7 +483,7 @@ class MailAccountHandler(LoggingMixin):
return message.subject return message.subject
elif rule.assign_title_from == MailRule.TitleSource.FROM_FILENAME: elif rule.assign_title_from == MailRule.TitleSource.FROM_FILENAME:
return os.path.splitext(os.path.basename(att.filename))[0] return Path(att.filename).stem
elif rule.assign_title_from == MailRule.TitleSource.NONE: elif rule.assign_title_from == MailRule.TitleSource.NONE:
return None return None
@ -908,7 +907,7 @@ class MailAccountHandler(LoggingMixin):
dir=settings.SCRATCH_DIR, dir=settings.SCRATCH_DIR,
suffix=".eml", suffix=".eml",
) )
with open(temp_filename, "wb") as f: with Path(temp_filename).open("wb") as f:
# Move "From"-header to beginning of file # Move "From"-header to beginning of file
# TODO: This ugly workaround is needed because the parser is # TODO: This ugly workaround is needed because the parser is
# chosen only by the mime_type detected via magic # chosen only by the mime_type detected via magic

View File

@ -1,4 +1,3 @@
import os
import shutil import shutil
import tempfile import tempfile
import uuid import uuid
@ -70,13 +69,13 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
page_count = parser.get_page_count( page_count = parser.get_page_count(
os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertEqual(page_count, 1) self.assertEqual(page_count, 1)
page_count = parser.get_page_count( page_count = parser.get_page_count(
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"), (self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertEqual(page_count, 6) self.assertEqual(page_count, 6)
@ -93,7 +92,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm: with self.assertLogs("paperless.parsing.tesseract", level="WARNING") as cm:
page_count = parser.get_page_count( page_count = parser.get_page_count(
os.path.join(self.SAMPLE_FILES, "password-protected.pdf"), (self.SAMPLE_FILES / "password-protected.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertEqual(page_count, None) self.assertEqual(page_count, None)
@ -102,7 +101,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_thumbnail(self): def test_thumbnail(self):
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail( thumb = parser.get_thumbnail(
os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsFile(thumb) self.assertIsFile(thumb)
@ -119,7 +118,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail( thumb = parser.get_thumbnail(
os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsFile(thumb) self.assertIsFile(thumb)
@ -127,7 +126,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_thumbnail_encrypted(self): def test_thumbnail_encrypted(self):
parser = RasterisedDocumentParser(uuid.uuid4()) parser = RasterisedDocumentParser(uuid.uuid4())
thumb = parser.get_thumbnail( thumb = parser.get_thumbnail(
os.path.join(self.SAMPLE_FILES, "encrypted.pdf"), (self.SAMPLE_FILES / "encrypted.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsFile(thumb) self.assertIsFile(thumb)
@ -135,17 +134,17 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_get_dpi(self): def test_get_dpi(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")) dpi = parser.get_dpi((self.SAMPLE_FILES / "simple-no-dpi.png").as_posix())
self.assertEqual(dpi, None) self.assertEqual(dpi, None)
dpi = parser.get_dpi(os.path.join(self.SAMPLE_FILES, "simple.png")) dpi = parser.get_dpi((self.SAMPLE_FILES / "simple.png").as_posix())
self.assertEqual(dpi, 72) self.assertEqual(dpi, 72)
def test_simple_digital(self): def test_simple_digital(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
@ -157,7 +156,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "with-form.pdf"), (self.SAMPLE_FILES / "with-form.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
@ -173,7 +172,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "with-form.pdf"), (self.SAMPLE_FILES / "with-form.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
@ -187,7 +186,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_signed(self): def test_signed(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "signed.pdf"), "application/pdf") parser.parse((self.SAMPLE_FILES / "signed.pdf").as_posix(), "application/pdf")
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
self.assertContainsStrings( self.assertContainsStrings(
@ -203,7 +202,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "encrypted.pdf"), (self.SAMPLE_FILES / "encrypted.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
@ -214,7 +213,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_with_form_error_notext(self): def test_with_form_error_notext(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "with-form.pdf"), (self.SAMPLE_FILES / "with-form.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
@ -228,7 +227,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "with-form.pdf"), (self.SAMPLE_FILES / "with-form.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
@ -240,7 +239,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_image_simple(self): def test_image_simple(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png") parser.parse((self.SAMPLE_FILES / "simple.png").as_posix(), "image/png")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@ -252,11 +251,11 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
with tempfile.TemporaryDirectory() as tempdir: with tempfile.TemporaryDirectory() as tempdir:
# Copy sample file to temp directory, as the parsing changes the file # Copy sample file to temp directory, as the parsing changes the file
# and this makes it modified to Git # and this makes it modified to Git
sample_file = os.path.join(self.SAMPLE_FILES, "simple-alpha.png") sample_file = self.SAMPLE_FILES / "simple-alpha.png"
dest_file = os.path.join(tempdir, "simple-alpha.png") dest_file = Path(tempdir) / "simple-alpha.png"
shutil.copy(sample_file, dest_file) shutil.copy(sample_file, dest_file)
parser.parse(dest_file, "image/png") parser.parse(dest_file.as_posix(), "image/png")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@ -266,7 +265,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
dpi = parser.calculate_a4_dpi( dpi = parser.calculate_a4_dpi(
os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), (self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(),
) )
self.assertEqual(dpi, 62) self.assertEqual(dpi, 62)
@ -278,7 +277,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def f(): def f():
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), (self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(),
"image/png", "image/png",
) )
@ -288,7 +287,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_image_no_dpi_default(self): def test_image_no_dpi_default(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") parser.parse((self.SAMPLE_FILES / "simple-no-dpi.png").as_posix(), "image/png")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@ -300,7 +299,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page(self): def test_multi_page(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@ -313,7 +312,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_pages_skip(self): def test_multi_page_pages_skip(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@ -326,7 +325,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_pages_redo(self): def test_multi_page_pages_redo(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@ -339,7 +338,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_pages_force(self): def test_multi_page_pages_force(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@ -352,7 +351,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
def test_multi_page_analog_pages_skip(self): def test_multi_page_analog_pages_skip(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@ -376,7 +375,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@ -398,7 +397,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@ -420,7 +419,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
@ -443,7 +442,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
@ -468,7 +467,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@ -491,7 +490,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@ -514,7 +513,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
@ -537,7 +536,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@ -560,7 +559,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), (self.SAMPLE_FILES / "multi-page-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
@ -583,7 +582,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), (self.SAMPLE_FILES / "multi-page-images.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
@ -606,7 +605,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"), (self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@ -616,7 +615,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"], ["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"],
) )
with open(os.path.join(parser.tempdir, "sidecar.txt")) as f: with (parser.tempdir / "sidecar.txt").open() as f:
sidecar = f.read() sidecar = f.read()
self.assertIn("[OCR skipped on page(s) 4-6]", sidecar) self.assertIn("[OCR skipped on page(s) 4-6]", sidecar)
@ -637,7 +636,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "single-page-mixed.pdf"), (self.SAMPLE_FILES / "single-page-mixed.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsNotNone(parser.archive_path) self.assertIsNotNone(parser.archive_path)
@ -651,7 +650,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
], ],
) )
with open(os.path.join(parser.tempdir, "sidecar.txt")) as f: with (parser.tempdir / "sidecar.txt").open() as f:
sidecar = f.read().lower() sidecar = f.read().lower()
self.assertIn("this is some text, but in an image, also on page 1.", sidecar) self.assertIn("this is some text, but in an image, also on page 1.", sidecar)
@ -674,7 +673,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"), (self.SAMPLE_FILES / "multi-page-mixed.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
self.assertIsNone(parser.archive_path) self.assertIsNone(parser.archive_path)
@ -686,7 +685,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
@override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True) @override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True)
def test_rotate(self): def test_rotate(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "rotated.pdf"), "application/pdf") parser.parse((self.SAMPLE_FILES / "rotated.pdf").as_posix(), "application/pdf")
self.assertContainsStrings( self.assertContainsStrings(
parser.get_text(), parser.get_text(),
[ [
@ -708,7 +707,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "multi-page-images.tiff"), (self.SAMPLE_FILES / "multi-page-images.tiff").as_posix(),
"image/tiff", "image/tiff",
) )
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
@ -728,7 +727,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
- Text from all pages extracted - Text from all pages extracted
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
sample_file = os.path.join(self.SAMPLE_FILES, "multi-page-images-alpha.tiff") sample_file = self.SAMPLE_FILES / "multi-page-images-alpha.tiff"
with tempfile.NamedTemporaryFile() as tmp_file: with tempfile.NamedTemporaryFile() as tmp_file:
shutil.copy(sample_file, tmp_file.name) shutil.copy(sample_file, tmp_file.name)
parser.parse( parser.parse(
@ -753,10 +752,9 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
- Text from all pages extracted - Text from all pages extracted
""" """
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
sample_file = os.path.join( sample_file = (
self.SAMPLE_FILES, self.SAMPLE_FILES / "multi-page-images-alpha-rgb.tiff"
"multi-page-images-alpha-rgb.tiff", ).as_posix()
)
with tempfile.NamedTemporaryFile() as tmp_file: with tempfile.NamedTemporaryFile() as tmp_file:
shutil.copy(sample_file, tmp_file.name) shutil.copy(sample_file, tmp_file.name)
parser.parse( parser.parse(
@ -845,7 +843,7 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse( parser.parse(
os.path.join(self.SAMPLE_FILES, "rtl-test.pdf"), (self.SAMPLE_FILES / "rtl-test.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
@ -860,49 +858,52 @@ class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
self.assertRaises( self.assertRaises(
ParseError, ParseError,
parser.parse, parser.parse,
os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), (self.SAMPLE_FILES / "simple-digital.pdf").as_posix(),
"application/pdf", "application/pdf",
) )
class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase): class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):
SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") SAMPLE_FILES = Path(__file__).parent / "samples"
def test_bmp(self): def test_bmp(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp") parser.parse((self.SAMPLE_FILES / "simple.bmp").as_posix(), "image/bmp")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower()) self.assertIn("this is a test document", parser.get_text().lower())
def test_jpg(self): def test_jpg(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg") parser.parse((self.SAMPLE_FILES / "simple.jpg").as_posix(), "image/jpeg")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower()) self.assertIn("this is a test document", parser.get_text().lower())
def test_heic(self): def test_heic(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.heic"), "image/heic") parser.parse((self.SAMPLE_FILES / "simple.heic").as_posix(), "image/heic")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
self.assertIn("pizza", parser.get_text().lower()) self.assertIn("pizza", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=200) @override_settings(OCR_IMAGE_DPI=200)
def test_gif(self): def test_gif(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif") parser.parse((self.SAMPLE_FILES / "simple.gif").as_posix(), "image/gif")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower()) self.assertIn("this is a test document", parser.get_text().lower())
def test_tiff(self): def test_tiff(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff") parser.parse((self.SAMPLE_FILES / "simple.tif").as_posix(), "image/tiff")
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
self.assertIn("this is a test document", parser.get_text().lower()) self.assertIn("this is a test document", parser.get_text().lower())
@override_settings(OCR_IMAGE_DPI=72) @override_settings(OCR_IMAGE_DPI=72)
def test_webp(self): def test_webp(self):
parser = RasterisedDocumentParser(None) parser = RasterisedDocumentParser(None)
parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp") parser.parse(
(self.SAMPLE_FILES / "document.webp").as_posix(),
"image/webp",
)
self.assertIsFile(parser.archive_path) self.assertIsFile(parser.archive_path)
# Older tesseracts consistently mangle the space between "a webp", # Older tesseracts consistently mangle the space between "a webp",
# tesseract 5.3.0 seems to do a better job, so we're accepting both # tesseract 5.3.0 seems to do a better job, so we're accepting both