mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Merge branch 'master' of github.com:danielquinn/paperless
This commit is contained in:
		| @@ -210,6 +210,9 @@ OCR_LANGUAGE = os.getenv("PAPERLESS_OCR_LANGUAGE", "eng") | ||||
| # The number of threads to use for OCR | ||||
| OCR_THREADS = os.getenv("PAPERLESS_OCR_THREADS") | ||||
|  | ||||
# OCR all documents?  When true, OCR is run on every document instead of
# being skipped for PDFs that already contain text.
# Enabled by any of: yes, y, 1, t, true (case-insensitive).
OCR_ALWAYS = os.getenv("PAPERLESS_OCR_ALWAYS", "NO").lower() in (
    "yes",
    "y",
    "1",
    "t",
    "true",
)
|  | ||||
| # If this is true, any failed attempts to OCR a PDF will result in the PDF | ||||
| # being indexed anyway, with whatever we could get.  If it's False, the file | ||||
| # will simply be left in the CONSUMPTION_DIR. | ||||
|   | ||||
| @@ -3,6 +3,7 @@ import os | ||||
| import re | ||||
| import subprocess | ||||
| from multiprocessing.pool import Pool | ||||
| import pdftotext | ||||
|  | ||||
| import langdetect | ||||
| import pyocr | ||||
| @@ -31,6 +32,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     THREADS = int(settings.OCR_THREADS) if settings.OCR_THREADS else None | ||||
|     UNPAPER = settings.UNPAPER_BINARY | ||||
|     DEFAULT_OCR_LANGUAGE = settings.OCR_LANGUAGE | ||||
|     OCR_ALWAYS = settings.OCR_ALWAYS | ||||
|  | ||||
|     def get_thumbnail(self): | ||||
|         """ | ||||
| @@ -46,7 +48,21 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|  | ||||
|         return os.path.join(self.tempdir, "convert-0000.png") | ||||
|  | ||||
|     def _is_ocred(self): | ||||
|         # Extract text from PDF using pdftotext | ||||
|         text = get_text_from_pdf(self.document_path) | ||||
|  | ||||
|         # We assume, that a PDF with at least 50 characters contains text | ||||
|         # (so no OCR required) | ||||
|         if len(text) > 50: | ||||
|             return True | ||||
|  | ||||
|         return False | ||||
|  | ||||
|     def get_text(self): | ||||
|         if not self.OCR_ALWAYS and self._is_ocred(): | ||||
|             self.log("info", "Skipping OCR, using Text from PDF") | ||||
|             return get_text_from_pdf(self.document_path) | ||||
|  | ||||
|         images = self._get_greyscale() | ||||
|  | ||||
| @@ -212,3 +228,13 @@ def image_to_string(args): | ||||
|             except (TesseractError, OtherTesseractError): | ||||
|                 pass | ||||
|         return ocr.image_to_string(f, lang=lang) | ||||
|  | ||||
|  | ||||
def get_text_from_pdf(pdf_file):
    """
    Return the text embedded in the PDF at ``pdf_file``, with the pages
    joined by newlines.

    If pdftotext cannot parse the file, an empty string is returned so that
    callers always receive a str (still falsy, but safe to pass to len()).
    """
    with open(pdf_file, "rb") as f:
        try:
            pdf = pdftotext.PDF(f)
        except pdftotext.Error:
            # Unreadable PDF: report "no text" rather than a bool.
            return ""

    return "\n".join(pdf)
|   | ||||
		Reference in New Issue
	
	Block a user
	 Daniel Quinn
					Daniel Quinn