mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	completely reworked the OCRmyPDF parser.
This commit is contained in:
		| @@ -41,6 +41,10 @@ | |||||||
| #PAPERLESS_OCR_OUTPUT_TYPE=pdfa | #PAPERLESS_OCR_OUTPUT_TYPE=pdfa | ||||||
| #PAPERLESS_OCR_PAGES=1 | #PAPERLESS_OCR_PAGES=1 | ||||||
| #PAPERLESS_OCR_IMAGE_DPI=300 | #PAPERLESS_OCR_IMAGE_DPI=300 | ||||||
|  | #PAPERLESS_OCR_CLEAN=clean | ||||||
|  | #PAPERLESS_OCR_DESKEW=false | ||||||
|  | #PAPERLESS_OCR_ROTATE_PAGES=false | ||||||
|  | #PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD=10 | ||||||
| #PAPERLESS_OCR_USER_ARGS={} | #PAPERLESS_OCR_USER_ARGS={} | ||||||
| #PAPERLESS_CONVERT_MEMORY_LIMIT=0 | #PAPERLESS_CONVERT_MEMORY_LIMIT=0 | ||||||
| #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless | #PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless | ||||||
|   | |||||||
| @@ -449,6 +449,14 @@ OCR_MODE = os.getenv("PAPERLESS_OCR_MODE", "skip") | |||||||
|  |  | ||||||
| OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") | OCR_IMAGE_DPI = os.getenv("PAPERLESS_OCR_IMAGE_DPI") | ||||||
|  |  | ||||||
|  | OCR_CLEAN = os.getenv("PAPERLESS_OCR_CLEAN", "clean") | ||||||
|  |  | ||||||
|  | OCR_DESKEW = __get_boolean("PAPERLESS_OCR_DESKEW") | ||||||
|  |  | ||||||
|  | OCR_ROTATE_PAGES = __get_boolean("PAPERLESS_OCR_ROTATE_PAGES") | ||||||
|  |  | ||||||
|  | OCR_ROTATE_PAGES_THRESHOLD = float(os.getenv("PAPERLESS_OCR_ROTATE_PAGES_THRESHOLD", 10.0)) | ||||||
|  |  | ||||||
| OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") | OCR_USER_ARGS = os.getenv("PAPERLESS_OCR_USER_ARGS", "{}") | ||||||
|  |  | ||||||
| # GNUPG needs a home directory for some reason | # GNUPG needs a home directory for some reason | ||||||
|   | |||||||
| @@ -9,6 +9,10 @@ from documents.parsers import DocumentParser, ParseError, \ | |||||||
|     make_thumbnail_from_pdf |     make_thumbnail_from_pdf | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class NoTextFoundException(Exception): | ||||||
|  |     pass | ||||||
|  |  | ||||||
|  |  | ||||||
| class RasterisedDocumentParser(DocumentParser): | class RasterisedDocumentParser(DocumentParser): | ||||||
|     """ |     """ | ||||||
|     This parser uses Tesseract to try and get some text out of a rasterised |     This parser uses Tesseract to try and get some text out of a rasterised | ||||||
| @@ -18,12 +22,13 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|     logging_name = "paperless.parsing.tesseract" |     logging_name = "paperless.parsing.tesseract" | ||||||
|  |  | ||||||
|     def extract_metadata(self, document_path, mime_type): |     def extract_metadata(self, document_path, mime_type): | ||||||
|  |  | ||||||
|  |         result = [] | ||||||
|  |         if mime_type == 'application/pdf': | ||||||
|             import pikepdf |             import pikepdf | ||||||
|  |  | ||||||
|             namespace_pattern = re.compile(r"\{(.*)\}(.*)") |             namespace_pattern = re.compile(r"\{(.*)\}(.*)") | ||||||
|  |  | ||||||
|         result = [] |  | ||||||
|         if mime_type == 'application/pdf': |  | ||||||
|             pdf = pikepdf.open(document_path) |             pdf = pikepdf.open(document_path) | ||||||
|             meta = pdf.open_metadata() |             meta = pdf.open_metadata() | ||||||
|             for key, value in meta.items(): |             for key, value in meta.items(): | ||||||
| @@ -88,125 +93,199 @@ class RasterisedDocumentParser(DocumentParser): | |||||||
|                 f"Error while calculating DPI for image {image}: {e}") |                 f"Error while calculating DPI for image {image}: {e}") | ||||||
|             return None |             return None | ||||||
|  |  | ||||||
|  |     def extract_text(self, sidecar_file, pdf_file): | ||||||
|  |         if sidecar_file and os.path.isfile(sidecar_file): | ||||||
|  |             with open(sidecar_file, "r") as f: | ||||||
|  |                 text = f.read() | ||||||
|  |  | ||||||
|  |             if "[OCR skipped on page" not in text: | ||||||
|  |                 # This happens when there's already text in the input file. | ||||||
|  |                 # The sidecar file will only contain text for OCR'ed pages. | ||||||
|  |                 self.log("debug", "Using text from sidecar file") | ||||||
|  |                 return text | ||||||
|  |             else: | ||||||
|  |                 self.log("debug", "Incomplete sidecar file: discarding.") | ||||||
|  |  | ||||||
|  |         # no success with the sidecar file, try PDF | ||||||
|  |  | ||||||
|  |         if not os.path.isfile(pdf_file): | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |         from pdfminer.high_level import extract_text | ||||||
|  |         from pdfminer.pdftypes import PDFException | ||||||
|  |  | ||||||
|  |         try: | ||||||
|  |             text = extract_text(pdf_file) | ||||||
|  |             stripped = strip_excess_whitespace(text) | ||||||
|  |             self.log("debug", f"Extracted text from PDF file {pdf_file}") | ||||||
|  |             return stripped | ||||||
|  |         except PDFException: | ||||||
|  |             # probably not a PDF file. | ||||||
|  |             return None | ||||||
|  |  | ||||||
|  |     def construct_ocrmypdf_parameters(self, | ||||||
|  |                                       input_file, | ||||||
|  |                                       mime_type, | ||||||
|  |                                       output_file, | ||||||
|  |                                       sidecar_file, | ||||||
|  |                                       safe_fallback=False): | ||||||
|  |         ocrmypdf_args = { | ||||||
|  |             'input_file': input_file, | ||||||
|  |             'output_file': output_file, | ||||||
|  |             # need to use threads, since this will be run in daemonized | ||||||
|  |             # processes by django-q. | ||||||
|  |             'use_threads': True, | ||||||
|  |             'jobs': settings.THREADS_PER_WORKER, | ||||||
|  |             'language': settings.OCR_LANGUAGE, | ||||||
|  |             'output_type': settings.OCR_OUTPUT_TYPE, | ||||||
|  |             'progress_bar': False | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         if settings.OCR_MODE == 'force' or safe_fallback: | ||||||
|  |             ocrmypdf_args['force_ocr'] = True | ||||||
|  |         elif settings.OCR_MODE in ['skip', 'skip_noarchive']: | ||||||
|  |             ocrmypdf_args['skip_text'] = True | ||||||
|  |         elif settings.OCR_MODE == 'redo': | ||||||
|  |             ocrmypdf_args['redo_ocr'] = True | ||||||
|  |         else: | ||||||
|  |             raise ParseError( | ||||||
|  |                 f"Invalid ocr mode: {settings.OCR_MODE}") | ||||||
|  |  | ||||||
|  |         if settings.OCR_CLEAN == 'clean': | ||||||
|  |             ocrmypdf_args['clean'] = True | ||||||
|  |         elif settings.OCR_CLEAN == 'clean-final': | ||||||
|  |             ocrmypdf_args['clean_final'] = True | ||||||
|  |  | ||||||
|  |         if settings.OCR_DESKEW: | ||||||
|  |             ocrmypdf_args['deskew'] = True | ||||||
|  |  | ||||||
|  |         if settings.OCR_ROTATE_PAGES: | ||||||
|  |             ocrmypdf_args['rotate_pages'] = True | ||||||
|  |             ocrmypdf_args['rotate_pages_threshold'] = settings.OCR_ROTATE_PAGES_THRESHOLD  # NOQA: E501 | ||||||
|  |  | ||||||
|  |         if settings.OCR_PAGES > 0: | ||||||
|  |             ocrmypdf_args['pages'] = f"1-{settings.OCR_PAGES}" | ||||||
|  |         else: | ||||||
|  |             # sidecar is incompatible with pages | ||||||
|  |             ocrmypdf_args['sidecar'] = sidecar_file | ||||||
|  |  | ||||||
|  |         if self.is_image(mime_type): | ||||||
|  |             dpi = self.get_dpi(input_file) | ||||||
|  |             a4_dpi = self.calculate_a4_dpi(input_file) | ||||||
|  |             if dpi: | ||||||
|  |                 self.log( | ||||||
|  |                     "debug", | ||||||
|  |                     f"Detected DPI for image {input_file}: {dpi}" | ||||||
|  |                 ) | ||||||
|  |                 ocrmypdf_args['image_dpi'] = dpi | ||||||
|  |             elif settings.OCR_IMAGE_DPI: | ||||||
|  |                 ocrmypdf_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||||
|  |             elif a4_dpi: | ||||||
|  |                 ocrmypdf_args['image_dpi'] = a4_dpi | ||||||
|  |             else: | ||||||
|  |                 raise ParseError( | ||||||
|  |                     f"Cannot produce archive PDF for image {input_file}, " | ||||||
|  |                     f"no DPI information is present in this image and " | ||||||
|  |                     f"OCR_IMAGE_DPI is not set.") | ||||||
|  |  | ||||||
|  |         if settings.OCR_USER_ARGS and not safe_fallback: | ||||||
|  |             try: | ||||||
|  |                 user_args = json.loads(settings.OCR_USER_ARGS) | ||||||
|  |                 ocrmypdf_args = {**ocrmypdf_args, **user_args} | ||||||
|  |             except Exception as e: | ||||||
|  |                 self.log( | ||||||
|  |                     "warning", | ||||||
|  |                     f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " | ||||||
|  |                     f"they will not be used. Error: {e}") | ||||||
|  |  | ||||||
|  |         return ocrmypdf_args | ||||||
|  |  | ||||||
|     def parse(self, document_path, mime_type, file_name=None): |     def parse(self, document_path, mime_type, file_name=None): | ||||||
|         import ocrmypdf |         # This forces tesseract to use one core per page. | ||||||
|         from ocrmypdf import InputFileError, EncryptedPdfError |         os.environ['OMP_THREAD_LIMIT'] = "1" | ||||||
|  |  | ||||||
|         mode = settings.OCR_MODE |         text_original = self.extract_text(None, document_path) | ||||||
|  |         original_has_text = text_original and len(text_original) > 50 | ||||||
|  |  | ||||||
|         text_original = get_text_from_pdf(document_path) |         if settings.OCR_MODE == "skip_noarchive" and original_has_text: | ||||||
|         has_text = text_original and len(text_original) > 50 |  | ||||||
|  |  | ||||||
|         if mode == "skip_noarchive" and has_text: |  | ||||||
|             self.log("debug", |             self.log("debug", | ||||||
|                      "Document has text, skipping OCRmyPDF entirely.") |                      "Document has text, skipping OCRmyPDF entirely.") | ||||||
|             self.text = text_original |             self.text = text_original | ||||||
|             return |             return | ||||||
|  |  | ||||||
|         if mode in ['skip', 'skip_noarchive'] and not has_text: |         import ocrmypdf | ||||||
|             # upgrade to redo, since there appears to be no text in the |         from ocrmypdf import InputFileError, EncryptedPdfError | ||||||
|             # document. This happens to some weird encrypted documents or |  | ||||||
|             # documents with failed OCR attempts for which OCRmyPDF will |  | ||||||
|             # still report that there actually is text in them. |  | ||||||
|             self.log("debug", |  | ||||||
|                      "No text was found in the document and skip is " |  | ||||||
|                      "specified. Upgrading OCR mode to redo.") |  | ||||||
|             mode = "redo" |  | ||||||
|  |  | ||||||
|         archive_path = os.path.join(self.tempdir, "archive.pdf") |         archive_path = os.path.join(self.tempdir, "archive.pdf") | ||||||
|  |         sidecar_file = os.path.join(self.tempdir, "sidecar.txt") | ||||||
|  |  | ||||||
|         ocr_args = { |         args = self.construct_ocrmypdf_parameters( | ||||||
|             'input_file': document_path, |             document_path, mime_type, archive_path, sidecar_file) | ||||||
|             'output_file': archive_path, |  | ||||||
|             'use_threads': True, |  | ||||||
|             'jobs': settings.THREADS_PER_WORKER, |  | ||||||
|             'language': settings.OCR_LANGUAGE, |  | ||||||
|             'output_type': settings.OCR_OUTPUT_TYPE, |  | ||||||
|             'progress_bar': False, |  | ||||||
|             'clean': True |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         if settings.OCR_PAGES > 0: |  | ||||||
|             ocr_args['pages'] = f"1-{settings.OCR_PAGES}" |  | ||||||
|  |  | ||||||
|         # Mode selection. |  | ||||||
|  |  | ||||||
|         if mode in ['skip', 'skip_noarchive']: |  | ||||||
|             ocr_args['skip_text'] = True |  | ||||||
|         elif mode == 'redo': |  | ||||||
|             ocr_args['redo_ocr'] = True |  | ||||||
|         elif mode == 'force': |  | ||||||
|             ocr_args['force_ocr'] = True |  | ||||||
|         else: |  | ||||||
|             raise ParseError( |  | ||||||
|                 f"Invalid ocr mode: {mode}") |  | ||||||
|  |  | ||||||
|         if self.is_image(mime_type): |  | ||||||
|             dpi = self.get_dpi(document_path) |  | ||||||
|             a4_dpi = self.calculate_a4_dpi(document_path) |  | ||||||
|             if dpi: |  | ||||||
|                 self.log( |  | ||||||
|                     "debug", |  | ||||||
|                     f"Detected DPI for image {document_path}: {dpi}" |  | ||||||
|                 ) |  | ||||||
|                 ocr_args['image_dpi'] = dpi |  | ||||||
|             elif settings.OCR_IMAGE_DPI: |  | ||||||
|                 ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI |  | ||||||
|             elif a4_dpi: |  | ||||||
|                 ocr_args['image_dpi'] = a4_dpi |  | ||||||
|             else: |  | ||||||
|                 raise ParseError( |  | ||||||
|                     f"Cannot produce archive PDF for image {document_path}, " |  | ||||||
|                     f"no DPI information is present in this image and " |  | ||||||
|                     f"OCR_IMAGE_DPI is not set.") |  | ||||||
|  |  | ||||||
|         if settings.OCR_USER_ARGS: |  | ||||||
|             try: |  | ||||||
|                 user_args = json.loads(settings.OCR_USER_ARGS) |  | ||||||
|                 ocr_args = {**ocr_args, **user_args} |  | ||||||
|             except Exception as e: |  | ||||||
|                 self.log( |  | ||||||
|                     "warning", |  | ||||||
|                     f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " |  | ||||||
|                     f"they will not be used: {e}") |  | ||||||
|  |  | ||||||
|         # This forces tesseract to use one core per page. |  | ||||||
|         os.environ['OMP_THREAD_LIMIT'] = "1" |  | ||||||
|  |  | ||||||
|         try: |         try: | ||||||
|             self.log("debug", |             self.log("debug", f"Calling OCRmyPDF with args: {args}") | ||||||
|                      f"Calling OCRmyPDF with {str(ocr_args)}") |             ocrmypdf.ocr(**args) | ||||||
|             ocrmypdf.ocr(**ocr_args) |  | ||||||
|             # success! announce results |  | ||||||
|             self.archive_path = archive_path |             self.archive_path = archive_path | ||||||
|             self.text = get_text_from_pdf(archive_path) |             self.text = self.extract_text(sidecar_file, archive_path) | ||||||
|  |  | ||||||
|         except (InputFileError, EncryptedPdfError) as e: |  | ||||||
|  |  | ||||||
|             self.log("debug", |  | ||||||
|                      f"Encountered an error: {e}. Trying to use text from " |  | ||||||
|                      f"original.") |  | ||||||
|             # This happens with some PDFs when used with the redo_ocr option. |  | ||||||
|             # This is not the end of the world, we'll just use what we already |  | ||||||
|             # have in the document. |  | ||||||
|             self.text = text_original |  | ||||||
|             # Also, no archived file. |  | ||||||
|             if not self.text: |             if not self.text: | ||||||
|                 # However, if we don't have anything, fail: |                 raise NoTextFoundException( | ||||||
|  |                     "No text was found in the original document") | ||||||
|  |         except EncryptedPdfError: | ||||||
|  |             self.log("warning", | ||||||
|  |                      "This file is encrypted, OCR is impossible. Using " | ||||||
|  |                      "any text present in the original file.") | ||||||
|  |             if original_has_text: | ||||||
|  |                 self.text = text_original | ||||||
|  |         except (NoTextFoundException, InputFileError) as e: | ||||||
|  |             self.log("exception", | ||||||
|  |                      f"Encountered the following error while running OCR, " | ||||||
|  |                      f"attempting force OCR to get the text.") | ||||||
|  |  | ||||||
|  |             archive_path_fallback = os.path.join( | ||||||
|  |                 self.tempdir, "archive-fallback.pdf") | ||||||
|  |             sidecar_file_fallback = os.path.join( | ||||||
|  |                 self.tempdir, "sidecar-fallback.txt") | ||||||
|  |  | ||||||
|  |             # Attempt to run OCR with safe settings. | ||||||
|  |  | ||||||
|  |             args = self.construct_ocrmypdf_parameters( | ||||||
|  |                 document_path, mime_type, | ||||||
|  |                 archive_path_fallback, sidecar_file_fallback, | ||||||
|  |                 safe_fallback=True | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |             try: | ||||||
|  |                 self.log("debug", | ||||||
|  |                          f"Fallback: Calling OCRmyPDF with args: {args}") | ||||||
|  |                 ocrmypdf.ocr(**args) | ||||||
|  |  | ||||||
|  |                 # Don't return the archived file here, since this file | ||||||
|  |                 # is bigger and blurry due to --force-ocr. | ||||||
|  |  | ||||||
|  |                 self.text = self.extract_text( | ||||||
|  |                     sidecar_file_fallback, archive_path_fallback) | ||||||
|  |  | ||||||
|  |             except Exception as e: | ||||||
|  |                 # If this fails, we have a serious issue at hand. | ||||||
|                 raise ParseError(f"{e.__class__.__name__}: {str(e)}") |                 raise ParseError(f"{e.__class__.__name__}: {str(e)}") | ||||||
|  |  | ||||||
|         except Exception as e: |         except Exception as e: | ||||||
|             # Anything else is probably serious. |             # Anything else is probably serious. | ||||||
|             raise ParseError(f"{e.__class__.__name__}: {str(e)}") |             raise ParseError(f"{e.__class__.__name__}: {str(e)}") | ||||||
|  |  | ||||||
|  |         # As a last resort, if we still don't have any text for any reason, | ||||||
|  |         # try to extract the text from the original document. | ||||||
|         if not self.text: |         if not self.text: | ||||||
|             # This may happen for files that don't have any text. |             if original_has_text: | ||||||
|  |                 self.text = text_original | ||||||
|  |             else: | ||||||
|                 self.log( |                 self.log( | ||||||
|                 'warning', |                     "warning", | ||||||
|                 f"Document {document_path} does not have any text. " |                     f"No text was found in {document_path}, the content will " | ||||||
|                 f"This is probably an error or you tried to add an image " |                     f"be empty." | ||||||
|                 f"without text, or something is wrong with this document.") |                 ) | ||||||
|             self.text = "" |  | ||||||
|  |  | ||||||
|  |  | ||||||
| def strip_excess_whitespace(text): | def strip_excess_whitespace(text): | ||||||
| @@ -222,20 +301,3 @@ def strip_excess_whitespace(text): | |||||||
|     # TODO: this needs a rework |     # TODO: this needs a rework | ||||||
|     return no_trailing_whitespace.strip() |     return no_trailing_whitespace.strip() | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_text_from_pdf(pdf_file): |  | ||||||
|     import pdftotext |  | ||||||
|  |  | ||||||
|     if not os.path.isfile(pdf_file): |  | ||||||
|         return None |  | ||||||
|  |  | ||||||
|     with open(pdf_file, "rb") as f: |  | ||||||
|         try: |  | ||||||
|             pdf = pdftotext.PDF(f) |  | ||||||
|         except pdftotext.Error: |  | ||||||
|             # might not be a PDF file |  | ||||||
|             return None |  | ||||||
|  |  | ||||||
|     text = "\n".join(pdf) |  | ||||||
|  |  | ||||||
|     return strip_excess_whitespace(text) |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 jonaswinkler
					jonaswinkler