diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index 1cb79959a..abb3d3dfe 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -276,7 +276,10 @@ class RasterisedDocumentParser(DocumentParser): self.log("debug", f"Calling OCRmyPDF with args: {args}") ocrmypdf.ocr(**args) - self.archive_path = archive_path + # Only create archive file if archiving isn't being skipped + if settings.OCR_MODE != "skip_noarchive": + self.archive_path = archive_path + self.text = self.extract_text(sidecar_file, archive_path) if not self.text: diff --git a/src/paperless_tesseract/tests/test_parser.py b/src/paperless_tesseract/tests/test_parser.py index 6bf8bd5f4..700782a92 100644 --- a/src/paperless_tesseract/tests/test_parser.py +++ b/src/paperless_tesseract/tests/test_parser.py @@ -364,6 +364,16 @@ class TestParser(DirectoriesMixin, TestCase): @override_settings(OCR_MODE="skip_noarchive") def test_skip_noarchive_withtext(self): + """ + GIVEN: + - File with existing text layer + - OCR mode set to skip_noarchive + WHEN: + - Document is parsed + THEN: + - Text from the existing text layer is extracted + - No archive file is created + """ parser = RasterisedDocumentParser(None) parser.parse( os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), @@ -377,17 +387,29 @@ class TestParser(DirectoriesMixin, TestCase): @override_settings(OCR_MODE="skip_noarchive") def test_skip_noarchive_notext(self): + """ + GIVEN: + - File with text contained in images but no text layer + - OCR mode set to skip_noarchive + WHEN: + - Document is parsed + THEN: + - Text from images is extracted + - No archive file is created + """ parser = RasterisedDocumentParser(None) parser.parse( os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf", ) - self.assertTrue(os.path.isfile(parser.archive_path)) + self.assertContainsStrings( parser.get_text().lower(), ["page 1", "page 2", "page 3"], ) + self.assertIsNone(parser.archive_path) + 
@override_settings(OCR_MODE="skip") def test_multi_page_mixed(self): parser = RasterisedDocumentParser(None) @@ -408,6 +430,16 @@ class TestParser(DirectoriesMixin, TestCase): @override_settings(OCR_MODE="skip_noarchive") def test_multi_page_mixed_no_archive(self): + """ + GIVEN: + - File with some text contained in images and some in text layer + - OCR mode set to skip_noarchive + WHEN: + - Document is parsed + THEN: + - Text from images is extracted + - No archive file is created + """ parser = RasterisedDocumentParser(None) parser.parse( os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),