Creates a mix-in for asserting file system states

2025-12-16 01:31:09 -06:00 · 2023-02-19 18:00:45 -08:00
parent 1718cf6504
commit 0df91c31f1
14 changed files with 275 additions and 253 deletions
--- a/src/paperless_tesseract/tests/test_parser.py
+++ b/src/paperless_tesseract/tests/test_parser.py
@@ -10,6 +10,7 @@ from django.test import TestCase
 from documents.parsers import ParseError
 from documents.parsers import run_convert
 from documents.tests.utils import DirectoriesMixin
+from documents.tests.utils import FileSystemAssertsMixin
 from paperless_tesseract.parsers import post_process_text
 from paperless_tesseract.parsers import RasterisedDocumentParser

@@ -36,7 +37,7 @@ class FakeImageFile(ContextManager):
        return os.path.basename(self.fname)


-class TestParser(DirectoriesMixin, TestCase):
+class TestParser(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

@@ -88,7 +89,7 @@ class TestParser(DirectoriesMixin, TestCase):
            os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
            "application/pdf",
        )
-        self.assertTrue(os.path.isfile(thumb))
+        self.assertIsFile(thumb)

    @mock.patch("documents.parsers.run_convert")
    def test_thumbnail_fallback(self, m):
@@ -105,7 +106,7 @@ class TestParser(DirectoriesMixin, TestCase):
            os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"),
            "application/pdf",
        )
-        self.assertTrue(os.path.isfile(thumb))
+        self.assertIsFile(thumb)

    def test_thumbnail_encrypted(self):
        parser = RasterisedDocumentParser(uuid.uuid4())
@@ -113,7 +114,7 @@ class TestParser(DirectoriesMixin, TestCase):
            os.path.join(self.SAMPLE_FILES, "encrypted.pdf"),
            "application/pdf",
        )
-        self.assertTrue(os.path.isfile(thumb))
+        self.assertIsFile(thumb)

    def test_get_dpi(self):
        parser = RasterisedDocumentParser(None)
@@ -132,7 +133,7 @@ class TestParser(DirectoriesMixin, TestCase):
            "application/pdf",
        )

-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)

        self.assertContainsStrings(parser.get_text(), ["This is a test document."])

@@ -144,7 +145,7 @@ class TestParser(DirectoriesMixin, TestCase):
            "application/pdf",
        )

-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)

        self.assertContainsStrings(
            parser.get_text(),
@@ -225,7 +226,7 @@ class TestParser(DirectoriesMixin, TestCase):

        parser.parse(os.path.join(self.SAMPLE_FILES, "simple.png"), "image/png")

-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)

        self.assertContainsStrings(parser.get_text(), ["This is a test document."])

@@ -241,7 +242,7 @@ class TestParser(DirectoriesMixin, TestCase):

            parser.parse(dest_file, "image/png")

-            self.assertTrue(os.path.isfile(parser.archive_path))
+            self.assertIsFile(parser.archive_path)

            self.assertContainsStrings(parser.get_text(), ["This is a test document."])

@@ -273,7 +274,7 @@ class TestParser(DirectoriesMixin, TestCase):

        parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png")

-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)

        self.assertContainsStrings(
            parser.get_text().lower(),
@@ -286,7 +287,7 @@ class TestParser(DirectoriesMixin, TestCase):
            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
            "application/pdf",
        )
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text().lower(),
            ["page 1", "page 2", "page 3"],
@@ -299,7 +300,7 @@ class TestParser(DirectoriesMixin, TestCase):
            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
            "application/pdf",
        )
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text().lower(),
            ["page 1", "page 2", "page 3"],
@@ -312,7 +313,7 @@ class TestParser(DirectoriesMixin, TestCase):
            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
            "application/pdf",
        )
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text().lower(),
            ["page 1", "page 2", "page 3"],
@@ -325,7 +326,7 @@ class TestParser(DirectoriesMixin, TestCase):
            os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
            "application/pdf",
        )
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text().lower(),
            ["page 1", "page 2", "page 3"],
@@ -338,7 +339,7 @@ class TestParser(DirectoriesMixin, TestCase):
            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
            "application/pdf",
        )
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text().lower(),
            ["page 1", "page 2", "page 3"],
@@ -362,7 +363,7 @@ class TestParser(DirectoriesMixin, TestCase):
            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
            "application/pdf",
        )
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"])
        self.assertNotIn("page 3", parser.get_text().lower())

@@ -384,7 +385,7 @@ class TestParser(DirectoriesMixin, TestCase):
            os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
            "application/pdf",
        )
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertContainsStrings(parser.get_text().lower(), ["page 1"])
        self.assertNotIn("page 2", parser.get_text().lower())
        self.assertNotIn("page 3", parser.get_text().lower())
@@ -455,7 +456,7 @@ class TestParser(DirectoriesMixin, TestCase):
            "application/pdf",
        )
        self.assertIsNotNone(parser.archive_path)
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text().lower(),
            ["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"],
@@ -486,7 +487,7 @@ class TestParser(DirectoriesMixin, TestCase):
            "application/pdf",
        )
        self.assertIsNotNone(parser.archive_path)
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text().lower(),
            [
@@ -556,7 +557,7 @@ class TestParser(DirectoriesMixin, TestCase):
            os.path.join(self.SAMPLE_FILES, "multi-page-images.tiff"),
            "image/tiff",
        )
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertContainsStrings(
            parser.get_text().lower(),
            ["page 1", "page 2", "page 3"],
@@ -580,7 +581,7 @@ class TestParser(DirectoriesMixin, TestCase):
                tmp_file.name,
                "image/tiff",
            )
-            self.assertTrue(os.path.isfile(parser.archive_path))
+            self.assertIsFile(parser.archive_path)
            self.assertContainsStrings(
                parser.get_text().lower(),
                ["page 1", "page 2", "page 3"],
@@ -608,7 +609,7 @@ class TestParser(DirectoriesMixin, TestCase):
                tmp_file.name,
                "image/tiff",
            )
-            self.assertTrue(os.path.isfile(parser.archive_path))
+            self.assertIsFile(parser.archive_path)
            self.assertContainsStrings(
                parser.get_text().lower(),
                ["page 1", "page 2", "page 3"],
@@ -689,40 +690,40 @@ class TestParser(DirectoriesMixin, TestCase):
        self.assertIn("ةﯾﻠﺧﺎدﻻ ةرازو", parser.get_text())


-class TestParserFileTypes(DirectoriesMixin, TestCase):
+class TestParserFileTypes(DirectoriesMixin, FileSystemAssertsMixin, TestCase):

    SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples")

    def test_bmp(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "simple.bmp"), "image/bmp")
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertIn("this is a test document", parser.get_text().lower())

    def test_jpg(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "simple.jpg"), "image/jpeg")
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertIn("this is a test document", parser.get_text().lower())

    @override_settings(OCR_IMAGE_DPI=200)
    def test_gif(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "simple.gif"), "image/gif")
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertIn("this is a test document", parser.get_text().lower())

    def test_tiff(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "simple.tif"), "image/tiff")
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
        self.assertIn("this is a test document", parser.get_text().lower())

    @override_settings(OCR_IMAGE_DPI=72)
    def test_webp(self):
        parser = RasterisedDocumentParser(None)
        parser.parse(os.path.join(self.SAMPLE_FILES, "document.webp"), "image/webp")
-        self.assertTrue(os.path.isfile(parser.archive_path))
+        self.assertIsFile(parser.archive_path)
         # OCR consistently mangles this space, oh well
        self.assertIn(
            "this is awebp document, created 11/14/2022.",