mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-09 09:58:20 -05:00
Reverts the change around skip_noarchive to align with how it is documented to work
This commit is contained in:
parent
53e8d84af2
commit
d1aa08850d
@ -249,16 +249,22 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
|
|
||||||
if mime_type == "application/pdf":
|
if mime_type == "application/pdf":
|
||||||
text_original = self.extract_text(None, document_path)
|
text_original = self.extract_text(None, document_path)
|
||||||
original_has_text = text_original and len(text_original) > 50
|
original_has_text = text_original is not None and len(text_original) > 50
|
||||||
else:
|
else:
|
||||||
text_original = None
|
text_original = None
|
||||||
original_has_text = False
|
original_has_text = False
|
||||||
|
|
||||||
|
# If the original has text, and the user doesn't want an archive,
|
||||||
|
# we're done here
|
||||||
if settings.OCR_MODE == "skip_noarchive" and original_has_text:
|
if settings.OCR_MODE == "skip_noarchive" and original_has_text:
|
||||||
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
|
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
|
||||||
self.text = text_original
|
self.text = text_original
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Either no text was in the original or there should be an archive
|
||||||
|
# file created, so OCR the file and create an archive with any
|
||||||
|
# text located via OCR
|
||||||
|
|
||||||
import ocrmypdf
|
import ocrmypdf
|
||||||
from ocrmypdf import InputFileError, EncryptedPdfError
|
from ocrmypdf import InputFileError, EncryptedPdfError
|
||||||
|
|
||||||
@ -276,9 +282,7 @@ class RasterisedDocumentParser(DocumentParser):
|
|||||||
self.log("debug", f"Calling OCRmyPDF with args: {args}")
|
self.log("debug", f"Calling OCRmyPDF with args: {args}")
|
||||||
ocrmypdf.ocr(**args)
|
ocrmypdf.ocr(**args)
|
||||||
|
|
||||||
# Only create archive file if archiving isn't being skipped
|
self.archive_path = archive_path
|
||||||
if settings.OCR_MODE != "skip_noarchive":
|
|
||||||
self.archive_path = archive_path
|
|
||||||
|
|
||||||
self.text = self.extract_text(sidecar_file, archive_path)
|
self.text = self.extract_text(sidecar_file, archive_path)
|
||||||
|
|
||||||
|
@ -341,6 +341,17 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
|
|
||||||
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
|
@override_settings(OCR_PAGES=2, OCR_MODE="redo")
|
||||||
def test_multi_page_analog_pages_redo(self):
|
def test_multi_page_analog_pages_redo(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- File with text contained in images but no text layer
|
||||||
|
- OCR of only pages 1 and 2 requested
|
||||||
|
- OCR mode set to redo
|
||||||
|
WHEN:
|
||||||
|
- Document is parsed
|
||||||
|
THEN:
|
||||||
|
- Text of page 1 and 2 extracted
|
||||||
|
- An archive file is created
|
||||||
|
"""
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
parser.parse(
|
parser.parse(
|
||||||
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
||||||
@ -352,6 +363,17 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
|
|
||||||
@override_settings(OCR_PAGES=1, OCR_MODE="force")
|
@override_settings(OCR_PAGES=1, OCR_MODE="force")
|
||||||
def test_multi_page_analog_pages_force(self):
|
def test_multi_page_analog_pages_force(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- File with text contained in images but no text layer
|
||||||
|
- OCR of only page 1 requested
|
||||||
|
- OCR mode set to force
|
||||||
|
WHEN:
|
||||||
|
- Document is parsed
|
||||||
|
THEN:
|
||||||
|
- Only text of page 1 is extracted
|
||||||
|
- An archive file is created
|
||||||
|
"""
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
parser.parse(
|
parser.parse(
|
||||||
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"),
|
||||||
@ -395,7 +417,7 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
- Document is parsed
|
- Document is parsed
|
||||||
THEN:
|
THEN:
|
||||||
- Text from images is extracted
|
- Text from images is extracted
|
||||||
- No archive file is created
|
- An archive file is created with the OCRd text
|
||||||
"""
|
"""
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
parser.parse(
|
parser.parse(
|
||||||
@ -408,15 +430,26 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
["page 1", "page 2", "page 3"],
|
["page 1", "page 2", "page 3"],
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertIsNone(parser.archive_path)
|
self.assertIsNotNone(parser.archive_path)
|
||||||
|
|
||||||
@override_settings(OCR_MODE="skip")
|
@override_settings(OCR_MODE="skip")
|
||||||
def test_multi_page_mixed(self):
|
def test_multi_page_mixed(self):
|
||||||
|
"""
|
||||||
|
GIVEN:
|
||||||
|
- File with some text contained in images and some in text layer
|
||||||
|
- OCR mode set to skip
|
||||||
|
WHEN:
|
||||||
|
- Document is parsed
|
||||||
|
THEN:
|
||||||
|
- Text from images is extracted
|
||||||
|
- An archive file is created with the OCRd text and the original text
|
||||||
|
"""
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
parser.parse(
|
parser.parse(
|
||||||
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
|
os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"),
|
||||||
"application/pdf",
|
"application/pdf",
|
||||||
)
|
)
|
||||||
|
self.assertIsNotNone(parser.archive_path)
|
||||||
self.assertTrue(os.path.isfile(parser.archive_path))
|
self.assertTrue(os.path.isfile(parser.archive_path))
|
||||||
self.assertContainsStrings(
|
self.assertContainsStrings(
|
||||||
parser.get_text().lower(),
|
parser.get_text().lower(),
|
||||||
@ -438,7 +471,7 @@ class TestParser(DirectoriesMixin, TestCase):
|
|||||||
- Document is parsed
|
- Document is parsed
|
||||||
THEN:
|
THEN:
|
||||||
- Text from images is extracted
|
- Text from images is extracted
|
||||||
- No archive file is created
|
- No archive file is created as original file contains text
|
||||||
"""
|
"""
|
||||||
parser = RasterisedDocumentParser(None)
|
parser = RasterisedDocumentParser(None)
|
||||||
parser.parse(
|
parser.parse(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user