Add PAPERLESS_OCR_SKIP_ARCHIVE_FILE config setting

This commit is contained in:
Brandon Rothweiler
2023-02-23 22:42:57 -05:00
parent 8a89f5ae27
commit ca412e0184
8 changed files with 185 additions and 14 deletions

View File

@@ -294,7 +294,11 @@ class RasterisedDocumentParser(DocumentParser):
# If the original has text, and the user doesn't want an archive,
# we're done here
if settings.OCR_MODE == "skip_noarchive" and original_has_text:
skip_archive_for_text = (
settings.OCR_MODE == "skip_noarchive"
or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
)
if skip_archive_for_text and original_has_text:
self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
self.text = text_original
return
@@ -320,7 +324,8 @@ class RasterisedDocumentParser(DocumentParser):
self.log("debug", f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args)
self.archive_path = archive_path
if settings.OCR_SKIP_ARCHIVE_FILE != "always":
self.archive_path = archive_path
self.text = self.extract_text(sidecar_file, archive_path)