mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
Merge pull request #1442 from paperless-ngx/fix/skip-archive-still-archiving
Bugfix: Fixes the creation of an archive file, even if noarchive was specified
This commit is contained in:
commit
1692bac3fe
@ -276,7 +276,10 @@ class RasterisedDocumentParser(DocumentParser):
|
||||
self.log("debug", f"Calling OCRmyPDF with args: {args}")
|
||||
ocrmypdf.ocr(**args)
|
||||
|
||||
self.archive_path = archive_path
|
||||
# Only create archive file if archiving isn't being skipped
|
||||
if settings.OCR_MODE != "skip_noarchive":
|
||||
self.archive_path = archive_path
|
||||
|
||||
self.text = self.extract_text(sidecar_file, archive_path)
|
||||
|
||||
if not self.text:
|
||||
|
@ -364,6 +364,16 @@ class TestParser(DirectoriesMixin, TestCase):
|
||||
|
||||
@override_settings(OCR_MODE="skip_noarchive")
|
||||
def test_skip_noarchive_withtext(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- File with existing text layer
|
||||
- OCR mode set to skip_noarchive
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- No archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"),
|
||||
@ -377,17 +387,29 @@ class TestParser(DirectoriesMixin, TestCase):
|
||||
|
||||
@override_settings(OCR_MODE="skip_noarchive")
|
||||
def test_skip_noarchive_notext(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- File with text contained in images but no text layer
|
||||
- OCR mode set to skip_noarchive
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- No archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
||||
"application/pdf",
|
||||
)
|
||||
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||
|
||||
self.assertContainsStrings(
|
||||
parser.get_text().lower(),
|
||||
["page 1", "page 2", "page 3"],
|
||||
)
|
||||
|
||||
self.assertIsNone(parser.archive_path)
|
||||
|
||||
@override_settings(OCR_MODE="skip")
|
||||
def test_multi_page_mixed(self):
|
||||
parser = RasterisedDocumentParser(None)
|
||||
@ -408,6 +430,16 @@ class TestParser(DirectoriesMixin, TestCase):
|
||||
|
||||
@override_settings(OCR_MODE="skip_noarchive")
|
||||
def test_multi_page_mixed_no_archive(self):
|
||||
"""
|
||||
GIVEN:
|
||||
- File with some text contained in images and some in text layer
|
||||
- OCR mode set to skip_noarchive
|
||||
WHEN:
|
||||
- Document is parsed
|
||||
THEN:
|
||||
- Text from images is extracted
|
||||
- No archive file is created
|
||||
"""
|
||||
parser = RasterisedDocumentParser(None)
|
||||
parser.parse(
|
||||
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
|
||||
|
Loading…
x
Reference in New Issue
Block a user