mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-11-03 03:16:10 -06:00 
			
		
		
		
	catch encrypted pdf documents
This commit is contained in:
		@@ -7,7 +7,7 @@ import ocrmypdf
 | 
			
		||||
import pdftotext
 | 
			
		||||
from PIL import Image
 | 
			
		||||
from django.conf import settings
 | 
			
		||||
from ocrmypdf import InputFileError
 | 
			
		||||
from ocrmypdf import InputFileError, EncryptedPdfError
 | 
			
		||||
 | 
			
		||||
from documents.parsers import DocumentParser, ParseError, run_convert
 | 
			
		||||
 | 
			
		||||
@@ -83,11 +83,12 @@ class RasterisedDocumentParser(DocumentParser):
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
    def parse(self, document_path, mime_type):
 | 
			
		||||
        if settings.OCR_MODE == "skip_noarchive":
 | 
			
		||||
            text = get_text_from_pdf(document_path)
 | 
			
		||||
            if text and len(text) > 50:
 | 
			
		||||
                self.text = text
 | 
			
		||||
                return
 | 
			
		||||
        text_original = get_text_from_pdf(document_path)
 | 
			
		||||
        has_text = text_original and len(text_original) > 50
 | 
			
		||||
 | 
			
		||||
        if settings.OCR_MODE == "skip_noarchive" and has_text:
 | 
			
		||||
            self.text = text_original
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
        archive_path = os.path.join(self.tempdir, "archive.pdf")
 | 
			
		||||
 | 
			
		||||
@@ -105,6 +106,8 @@ class RasterisedDocumentParser(DocumentParser):
 | 
			
		||||
        if settings.OCR_PAGES > 0:
 | 
			
		||||
            ocr_args['pages'] = f"1-{settings.OCR_PAGES}"
 | 
			
		||||
 | 
			
		||||
        # Mode selection.
 | 
			
		||||
 | 
			
		||||
        if settings.OCR_MODE in ['skip', 'skip_noarchive']:
 | 
			
		||||
            ocr_args['skip_text'] = True
 | 
			
		||||
        elif settings.OCR_MODE == 'redo':
 | 
			
		||||
@@ -149,11 +152,11 @@ class RasterisedDocumentParser(DocumentParser):
 | 
			
		||||
            self.archive_path = archive_path
 | 
			
		||||
            self.text = get_text_from_pdf(archive_path)
 | 
			
		||||
 | 
			
		||||
        except InputFileError as e:
 | 
			
		||||
        except (InputFileError, EncryptedPdfError) as e:
 | 
			
		||||
            # This happens with some PDFs when used with the redo_ocr option.
 | 
			
		||||
            # This is not the end of the world, we'll just use what we already
 | 
			
		||||
            # have in the document.
 | 
			
		||||
            self.text = get_text_from_pdf(document_path)
 | 
			
		||||
            self.text = text_original
 | 
			
		||||
            # Also, no archived file.
 | 
			
		||||
            if not self.text:
 | 
			
		||||
                # However, if we don't have anything, fail:
 | 
			
		||||
@@ -169,7 +172,7 @@ class RasterisedDocumentParser(DocumentParser):
 | 
			
		||||
                'warning',
 | 
			
		||||
                f"Document {document_path} does not have any text."
 | 
			
		||||
                f"This is probably an error or you tried to add an image "
 | 
			
		||||
                f"without text.")
 | 
			
		||||
                f"without text, or something is wrong with this document.")
 | 
			
		||||
            self.text = ""
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user