mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Format Python code with black
This commit is contained in:
		| @@ -5,8 +5,7 @@ from django.core.checks import Error, Warning, register | ||||
|  | ||||
|  | ||||
| def get_tesseract_langs(): | ||||
|     with subprocess.Popen(['tesseract', '--list-langs'], | ||||
|                           stdout=subprocess.PIPE) as p: | ||||
|     with subprocess.Popen(["tesseract", "--list-langs"], stdout=subprocess.PIPE) as p: | ||||
|         stdout, stderr = p.communicate() | ||||
|  | ||||
|     return stdout.decode().strip().split("\n")[1:] | ||||
| @@ -17,18 +16,23 @@ def check_default_language_available(app_configs, **kwargs): | ||||
|     installed_langs = get_tesseract_langs() | ||||
|  | ||||
|     if not settings.OCR_LANGUAGE: | ||||
|         return [Warning( | ||||
|             "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. " | ||||
|             "This means that tesseract will fallback to english." | ||||
|         )] | ||||
|         return [ | ||||
|             Warning( | ||||
|                 "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE. " | ||||
|                 "This means that tesseract will fallback to english." | ||||
|             ) | ||||
|         ] | ||||
|  | ||||
|     specified_langs = settings.OCR_LANGUAGE.split("+") | ||||
|  | ||||
|     for lang in specified_langs: | ||||
|         if lang not in installed_langs: | ||||
|             return [Error( | ||||
|                 f"The selected ocr language {lang} is " | ||||
|                 f"not installed. Paperless cannot OCR your documents " | ||||
|                 f"without it. Please fix PAPERLESS_OCR_LANGUAGE.")] | ||||
|             return [ | ||||
|                 Error( | ||||
|                     f"The selected ocr language {lang} is " | ||||
|                     f"not installed. Paperless cannot OCR your documents " | ||||
|                     f"without it. Please fix PAPERLESS_OCR_LANGUAGE." | ||||
|                 ) | ||||
|             ] | ||||
|  | ||||
|     return [] | ||||
|   | ||||
| @@ -5,8 +5,7 @@ import re | ||||
| from PIL import Image | ||||
| from django.conf import settings | ||||
|  | ||||
| from documents.parsers import DocumentParser, ParseError, \ | ||||
|     make_thumbnail_from_pdf | ||||
| from documents.parsers import DocumentParser, ParseError, make_thumbnail_from_pdf | ||||
|  | ||||
|  | ||||
| class NoTextFoundException(Exception): | ||||
| @@ -24,7 +23,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|     def extract_metadata(self, document_path, mime_type): | ||||
|  | ||||
|         result = [] | ||||
|         if mime_type == 'application/pdf': | ||||
|         if mime_type == "application/pdf": | ||||
|             import pikepdf | ||||
|  | ||||
|             namespace_pattern = re.compile(r"\{(.*)\}(.*)") | ||||
| @@ -37,25 +36,25 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                 value = str(value) | ||||
|                 try: | ||||
|                     m = namespace_pattern.match(key) | ||||
|                     result.append({ | ||||
|                         "namespace": m.group(1), | ||||
|                         "prefix": meta.REVERSE_NS[m.group(1)], | ||||
|                         "key": m.group(2), | ||||
|                         "value": value | ||||
|                     }) | ||||
|                     result.append( | ||||
|                         { | ||||
|                             "namespace": m.group(1), | ||||
|                             "prefix": meta.REVERSE_NS[m.group(1)], | ||||
|                             "key": m.group(2), | ||||
|                             "value": value, | ||||
|                         } | ||||
|                     ) | ||||
|                 except Exception as e: | ||||
|                     self.log( | ||||
|                         "warning", | ||||
|                         f"Error while reading metadata {key}: {value}. Error: " | ||||
|                         f"{e}" | ||||
|                         f"Error while reading metadata {key}: {value}. Error: " f"{e}", | ||||
|                     ) | ||||
|         return result | ||||
|  | ||||
|     def get_thumbnail(self, document_path, mime_type, file_name=None): | ||||
|         return make_thumbnail_from_pdf( | ||||
|             self.archive_path or document_path, | ||||
|             self.tempdir, | ||||
|             self.logging_group) | ||||
|             self.archive_path or document_path, self.tempdir, self.logging_group | ||||
|         ) | ||||
|  | ||||
|     def is_image(self, mime_type): | ||||
|         return mime_type in [ | ||||
| @@ -68,17 +67,15 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|  | ||||
|     def has_alpha(self, image): | ||||
|         with Image.open(image) as im: | ||||
|             return im.mode in ('RGBA', 'LA') | ||||
|             return im.mode in ("RGBA", "LA") | ||||
|  | ||||
|     def get_dpi(self, image): | ||||
|         try: | ||||
|             with Image.open(image) as im: | ||||
|                 x, y = im.info['dpi'] | ||||
|                 x, y = im.info["dpi"] | ||||
|                 return round(x) | ||||
|         except Exception as e: | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 f"Error while getting DPI from image {image}: {e}") | ||||
|             self.log("warning", f"Error while getting DPI from image {image}: {e}") | ||||
|             return None | ||||
|  | ||||
|     def calculate_a4_dpi(self, image): | ||||
| @@ -87,16 +84,11 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                 width, height = im.size | ||||
|                 # divide image width by A4 width (210mm) in inches. | ||||
|                 dpi = int(width / (21 / 2.54)) | ||||
|                 self.log( | ||||
|                     'debug', | ||||
|                     f"Estimated DPI {dpi} based on image width {width}" | ||||
|                 ) | ||||
|                 self.log("debug", f"Estimated DPI {dpi} based on image width {width}") | ||||
|                 return dpi | ||||
|  | ||||
|         except Exception as e: | ||||
|             self.log( | ||||
|                 'warning', | ||||
|                 f"Error while calculating DPI for image {image}: {e}") | ||||
|             self.log("warning", f"Error while calculating DPI for image {image}: {e}") | ||||
|             return None | ||||
|  | ||||
|     def extract_text(self, sidecar_file, pdf_file): | ||||
| @@ -128,60 +120,60 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         except Exception: | ||||
|             # TODO catch all for various issues with PDFminer.six. | ||||
|             #  If PDFminer fails, fall back to OCR. | ||||
|             self.log("warn", | ||||
|                      "Error while getting text from PDF document with " | ||||
|                      "pdfminer.six", exc_info=True) | ||||
|             self.log( | ||||
|                 "warn", | ||||
|                 "Error while getting text from PDF document with " "pdfminer.six", | ||||
|                 exc_info=True, | ||||
|             ) | ||||
|             # probably not a PDF file. | ||||
|             return None | ||||
|  | ||||
|     def construct_ocrmypdf_parameters(self, | ||||
|                                       input_file, | ||||
|                                       mime_type, | ||||
|                                       output_file, | ||||
|                                       sidecar_file, | ||||
|                                       safe_fallback=False): | ||||
|     def construct_ocrmypdf_parameters( | ||||
|         self, input_file, mime_type, output_file, sidecar_file, safe_fallback=False | ||||
|     ): | ||||
|         ocrmypdf_args = { | ||||
|             'input_file': input_file, | ||||
|             'output_file': output_file, | ||||
|             "input_file": input_file, | ||||
|             "output_file": output_file, | ||||
|             # need to use threads, since this will be run in daemonized | ||||
|             # processes by django-q. | ||||
|             'use_threads': True, | ||||
|             'jobs': settings.THREADS_PER_WORKER, | ||||
|             'language': settings.OCR_LANGUAGE, | ||||
|             'output_type': settings.OCR_OUTPUT_TYPE, | ||||
|             'progress_bar': False | ||||
|             "use_threads": True, | ||||
|             "jobs": settings.THREADS_PER_WORKER, | ||||
|             "language": settings.OCR_LANGUAGE, | ||||
|             "output_type": settings.OCR_OUTPUT_TYPE, | ||||
|             "progress_bar": False, | ||||
|         } | ||||
|  | ||||
|         if settings.OCR_MODE == 'force' or safe_fallback: | ||||
|             ocrmypdf_args['force_ocr'] = True | ||||
|         elif settings.OCR_MODE in ['skip', 'skip_noarchive']: | ||||
|             ocrmypdf_args['skip_text'] = True | ||||
|         elif settings.OCR_MODE == 'redo': | ||||
|             ocrmypdf_args['redo_ocr'] = True | ||||
|         if settings.OCR_MODE == "force" or safe_fallback: | ||||
|             ocrmypdf_args["force_ocr"] = True | ||||
|         elif settings.OCR_MODE in ["skip", "skip_noarchive"]: | ||||
|             ocrmypdf_args["skip_text"] = True | ||||
|         elif settings.OCR_MODE == "redo": | ||||
|             ocrmypdf_args["redo_ocr"] = True | ||||
|         else: | ||||
|             raise ParseError( | ||||
|                 f"Invalid ocr mode: {settings.OCR_MODE}") | ||||
|             raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}") | ||||
|  | ||||
|         if settings.OCR_CLEAN == 'clean': | ||||
|             ocrmypdf_args['clean'] = True | ||||
|         elif settings.OCR_CLEAN == 'clean-final': | ||||
|             if settings.OCR_MODE == 'redo': | ||||
|                 ocrmypdf_args['clean'] = True | ||||
|         if settings.OCR_CLEAN == "clean": | ||||
|             ocrmypdf_args["clean"] = True | ||||
|         elif settings.OCR_CLEAN == "clean-final": | ||||
|             if settings.OCR_MODE == "redo": | ||||
|                 ocrmypdf_args["clean"] = True | ||||
|             else: | ||||
|                 ocrmypdf_args['clean_final'] = True | ||||
|                 ocrmypdf_args["clean_final"] = True | ||||
|  | ||||
|         if settings.OCR_DESKEW and not settings.OCR_MODE == 'redo': | ||||
|             ocrmypdf_args['deskew'] = True | ||||
|         if settings.OCR_DESKEW and not settings.OCR_MODE == "redo": | ||||
|             ocrmypdf_args["deskew"] = True | ||||
|  | ||||
|         if settings.OCR_ROTATE_PAGES: | ||||
|             ocrmypdf_args['rotate_pages'] = True | ||||
|             ocrmypdf_args['rotate_pages_threshold'] = settings.OCR_ROTATE_PAGES_THRESHOLD  # NOQA: E501 | ||||
|             ocrmypdf_args["rotate_pages"] = True | ||||
|             ocrmypdf_args[ | ||||
|                 "rotate_pages_threshold" | ||||
|             ] = settings.OCR_ROTATE_PAGES_THRESHOLD  # NOQA: E501 | ||||
|  | ||||
|         if settings.OCR_PAGES > 0: | ||||
|             ocrmypdf_args['pages'] = f"1-{settings.OCR_PAGES}" | ||||
|             ocrmypdf_args["pages"] = f"1-{settings.OCR_PAGES}" | ||||
|         else: | ||||
|             # sidecar is incompatible with pages | ||||
|             ocrmypdf_args['sidecar'] = sidecar_file | ||||
|             ocrmypdf_args["sidecar"] = sidecar_file | ||||
|  | ||||
|         if self.is_image(mime_type): | ||||
|             dpi = self.get_dpi(input_file) | ||||
| @@ -191,29 +183,27 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                 self.log( | ||||
|                     "info", | ||||
|                     f"Removing alpha layer from {input_file} " | ||||
|                     "for compatibility with img2pdf" | ||||
|                     "for compatibility with img2pdf", | ||||
|                 ) | ||||
|                 with Image.open(input_file) as im: | ||||
|                     background = Image.new('RGBA', im.size, (255, 255, 255)) | ||||
|                     background = Image.new("RGBA", im.size, (255, 255, 255)) | ||||
|                     background.alpha_composite(im) | ||||
|                     background = background.convert('RGB') | ||||
|                     background = background.convert("RGB") | ||||
|                     background.save(input_file, format=im.format) | ||||
|  | ||||
|             if dpi: | ||||
|                 self.log( | ||||
|                     "debug", | ||||
|                     f"Detected DPI for image {input_file}: {dpi}" | ||||
|                 ) | ||||
|                 ocrmypdf_args['image_dpi'] = dpi | ||||
|                 self.log("debug", f"Detected DPI for image {input_file}: {dpi}") | ||||
|                 ocrmypdf_args["image_dpi"] = dpi | ||||
|             elif settings.OCR_IMAGE_DPI: | ||||
|                 ocrmypdf_args['image_dpi'] = settings.OCR_IMAGE_DPI | ||||
|                 ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI | ||||
|             elif a4_dpi: | ||||
|                 ocrmypdf_args['image_dpi'] = a4_dpi | ||||
|                 ocrmypdf_args["image_dpi"] = a4_dpi | ||||
|             else: | ||||
|                 raise ParseError( | ||||
|                     f"Cannot produce archive PDF for image {input_file}, " | ||||
|                     f"no DPI information is present in this image and " | ||||
|                     f"OCR_IMAGE_DPI is not set.") | ||||
|                     f"OCR_IMAGE_DPI is not set." | ||||
|                 ) | ||||
|  | ||||
|         if settings.OCR_USER_ARGS and not safe_fallback: | ||||
|             try: | ||||
| @@ -223,13 +213,14 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " | ||||
|                     f"they will not be used. Error: {e}") | ||||
|                     f"they will not be used. Error: {e}", | ||||
|                 ) | ||||
|  | ||||
|         return ocrmypdf_args | ||||
|  | ||||
|     def parse(self, document_path, mime_type, file_name=None): | ||||
|         # This forces tesseract to use one core per page. | ||||
|         os.environ['OMP_THREAD_LIMIT'] = "1" | ||||
|         os.environ["OMP_THREAD_LIMIT"] = "1" | ||||
|  | ||||
|         if mime_type == "application/pdf": | ||||
|             text_original = self.extract_text(None, document_path) | ||||
| @@ -239,8 +230,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|             original_has_text = False | ||||
|  | ||||
|         if settings.OCR_MODE == "skip_noarchive" and original_has_text: | ||||
|             self.log("debug", | ||||
|                      "Document has text, skipping OCRmyPDF entirely.") | ||||
|             self.log("debug", "Document has text, skipping OCRmyPDF entirely.") | ||||
|             self.text = text_original | ||||
|             return | ||||
|  | ||||
| @@ -251,7 +241,8 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|         sidecar_file = os.path.join(self.tempdir, "sidecar.txt") | ||||
|  | ||||
|         args = self.construct_ocrmypdf_parameters( | ||||
|             document_path, mime_type, archive_path, sidecar_file) | ||||
|             document_path, mime_type, archive_path, sidecar_file | ||||
|         ) | ||||
|  | ||||
|         try: | ||||
|             self.log("debug", f"Calling OCRmyPDF with args: {args}") | ||||
| @@ -261,42 +252,45 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|             self.text = self.extract_text(sidecar_file, archive_path) | ||||
|  | ||||
|             if not self.text: | ||||
|                 raise NoTextFoundException( | ||||
|                     "No text was found in the original document") | ||||
|                 raise NoTextFoundException("No text was found in the original document") | ||||
|         except EncryptedPdfError: | ||||
|             self.log("warning", | ||||
|                      "This file is encrypted, OCR is impossible. Using " | ||||
|                      "any text present in the original file.") | ||||
|             self.log( | ||||
|                 "warning", | ||||
|                 "This file is encrypted, OCR is impossible. Using " | ||||
|                 "any text present in the original file.", | ||||
|             ) | ||||
|             if original_has_text: | ||||
|                 self.text = text_original | ||||
|         except (NoTextFoundException, InputFileError) as e: | ||||
|             self.log("warning", | ||||
|                      f"Encountered an error while running OCR: {str(e)}. " | ||||
|                      f"Attempting force OCR to get the text.") | ||||
|             self.log( | ||||
|                 "warning", | ||||
|                 f"Encountered an error while running OCR: {str(e)}. " | ||||
|                 f"Attempting force OCR to get the text.", | ||||
|             ) | ||||
|  | ||||
|             archive_path_fallback = os.path.join( | ||||
|                 self.tempdir, "archive-fallback.pdf") | ||||
|             sidecar_file_fallback = os.path.join( | ||||
|                 self.tempdir, "sidecar-fallback.txt") | ||||
|             archive_path_fallback = os.path.join(self.tempdir, "archive-fallback.pdf") | ||||
|             sidecar_file_fallback = os.path.join(self.tempdir, "sidecar-fallback.txt") | ||||
|  | ||||
|             # Attempt to run OCR with safe settings. | ||||
|  | ||||
|             args = self.construct_ocrmypdf_parameters( | ||||
|                 document_path, mime_type, | ||||
|                 archive_path_fallback, sidecar_file_fallback, | ||||
|                 safe_fallback=True | ||||
|                 document_path, | ||||
|                 mime_type, | ||||
|                 archive_path_fallback, | ||||
|                 sidecar_file_fallback, | ||||
|                 safe_fallback=True, | ||||
|             ) | ||||
|  | ||||
|             try: | ||||
|                 self.log("debug", | ||||
|                          f"Fallback: Calling OCRmyPDF with args: {args}") | ||||
|                 self.log("debug", f"Fallback: Calling OCRmyPDF with args: {args}") | ||||
|                 ocrmypdf.ocr(**args) | ||||
|  | ||||
|                 # Don't return the archived file here, since this file | ||||
|                 # is bigger and blurry due to --force-ocr. | ||||
|  | ||||
|                 self.text = self.extract_text( | ||||
|                     sidecar_file_fallback, archive_path_fallback) | ||||
|                     sidecar_file_fallback, archive_path_fallback | ||||
|                 ) | ||||
|  | ||||
|             except Exception as e: | ||||
|                 # If this fails, we have a serious issue at hand. | ||||
| @@ -315,7 +309,7 @@ class RasterisedDocumentParser(DocumentParser): | ||||
|                 self.log( | ||||
|                     "warning", | ||||
|                     f"No text was found in {document_path}, the content will " | ||||
|                     f"be empty." | ||||
|                     f"be empty.", | ||||
|                 ) | ||||
|                 self.text = "" | ||||
|  | ||||
| @@ -325,10 +319,8 @@ def post_process_text(text): | ||||
|         return None | ||||
|  | ||||
|     collapsed_spaces = re.sub(r"([^\S\r\n]+)", " ", text) | ||||
|     no_leading_whitespace = re.sub( | ||||
|         r"([\n\r]+)([^\S\n\r]+)", '\\1', collapsed_spaces) | ||||
|     no_trailing_whitespace = re.sub( | ||||
|         r"([^\S\n\r]+)$", '', no_leading_whitespace) | ||||
|     no_leading_whitespace = re.sub(r"([\n\r]+)([^\S\n\r]+)", "\\1", collapsed_spaces) | ||||
|     no_trailing_whitespace = re.sub(r"([^\S\n\r]+)$", "", no_leading_whitespace) | ||||
|  | ||||
|     # TODO: this needs a rework | ||||
|     # replace \0 prevents issues with saving to postgres. | ||||
|   | ||||
| @@ -1,4 +1,3 @@ | ||||
|  | ||||
| def get_parser(*args, **kwargs): | ||||
|     from .parsers import RasterisedDocumentParser | ||||
|  | ||||
| @@ -16,5 +15,5 @@ def tesseract_consumer_declaration(sender, **kwargs): | ||||
|             "image/tiff": ".tif", | ||||
|             "image/gif": ".gif", | ||||
|             "image/bmp": ".bmp", | ||||
|         } | ||||
|         }, | ||||
|     } | ||||
|   | ||||
| @@ -7,7 +7,6 @@ from paperless_tesseract import check_default_language_available | ||||
|  | ||||
|  | ||||
| class TestChecks(TestCase): | ||||
|  | ||||
|     def test_default_language(self): | ||||
|         msgs = check_default_language_available(None) | ||||
|  | ||||
| @@ -15,7 +14,11 @@ class TestChecks(TestCase): | ||||
|     def test_no_language(self): | ||||
|         msgs = check_default_language_available(None) | ||||
|         self.assertEqual(len(msgs), 1) | ||||
|         self.assertTrue(msgs[0].msg.startswith("No OCR language has been specified with PAPERLESS_OCR_LANGUAGE")) | ||||
|         self.assertTrue( | ||||
|             msgs[0].msg.startswith( | ||||
|                 "No OCR language has been specified with PAPERLESS_OCR_LANGUAGE" | ||||
|             ) | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_LANGUAGE="ita") | ||||
|     @mock.patch("paperless_tesseract.checks.get_tesseract_langs") | ||||
|   | ||||
| @@ -33,7 +33,6 @@ class FakeImageFile(ContextManager): | ||||
|  | ||||
|  | ||||
| class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     def assertContainsStrings(self, content, strings): | ||||
|         # Asserts that all strings appear in content, in the given order. | ||||
|         indices = [] | ||||
| @@ -46,14 +45,8 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|     text_cases = [ | ||||
|         ("simple     string", "simple string"), | ||||
|         ( | ||||
|             "simple    newline\n   testing string", | ||||
|             "simple newline\ntesting string" | ||||
|         ), | ||||
|         ( | ||||
|             "utf-8   строка с пробелами в конце  ", | ||||
|             "utf-8 строка с пробелами в конце" | ||||
|         ) | ||||
|         ("simple    newline\n   testing string", "simple newline\ntesting string"), | ||||
|         ("utf-8   строка с пробелами в конце  ", "utf-8 строка с пробелами в конце"), | ||||
|     ] | ||||
|  | ||||
|     def test_post_process_text(self): | ||||
| @@ -63,28 +56,29 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|                 result, | ||||
|                 actual_result, | ||||
|                 "strip_exceess_whitespace({}) != '{}', but '{}'".format( | ||||
|                     source, | ||||
|                     result, | ||||
|                     actual_result | ||||
|                 ) | ||||
|                     source, result, actual_result | ||||
|                 ), | ||||
|             ) | ||||
|  | ||||
|     SAMPLE_FILES = os.path.join(os.path.dirname(__file__), "samples") | ||||
|  | ||||
|     def test_get_text_from_pdf(self): | ||||
|         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||
|         text = parser.extract_text(None, os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf')) | ||||
|         text = parser.extract_text( | ||||
|             None, os.path.join(self.SAMPLE_FILES, "simple-digital.pdf") | ||||
|         ) | ||||
|  | ||||
|         self.assertContainsStrings(text.strip(), ["This is a test document."]) | ||||
|  | ||||
|     def test_thumbnail(self): | ||||
|         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||
|         thumb = parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||
|         thumb = parser.get_thumbnail( | ||||
|             os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(thumb)) | ||||
|  | ||||
|     @mock.patch("documents.parsers.run_convert") | ||||
|     def test_thumbnail_fallback(self, m): | ||||
|  | ||||
|         def call_convert(input_file, output_file, **kwargs): | ||||
|             if ".pdf" in input_file: | ||||
|                 raise ParseError("Does not compute.") | ||||
| @@ -94,12 +88,16 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|         m.side_effect = call_convert | ||||
|  | ||||
|         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||
|         thumb = parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'simple-digital.pdf'), "application/pdf") | ||||
|         thumb = parser.get_thumbnail( | ||||
|             os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(thumb)) | ||||
|  | ||||
|     def test_thumbnail_encrypted(self): | ||||
|         parser = RasterisedDocumentParser(uuid.uuid4()) | ||||
|         thumb = parser.get_thumbnail(os.path.join(self.SAMPLE_FILES, 'encrypted.pdf'), "application/pdf") | ||||
|         thumb = parser.get_thumbnail( | ||||
|             os.path.join(self.SAMPLE_FILES, "encrypted.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(thumb)) | ||||
|  | ||||
|     def test_get_dpi(self): | ||||
| @@ -114,7 +112,9 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|     def test_simple_digital(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "simple-digital.pdf"), "application/pdf" | ||||
|         ) | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
| @@ -123,20 +123,30 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|     def test_with_form(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf" | ||||
|         ) | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text(), | ||||
|             ["Please enter your name in here:", "This is a PDF document with a form."], | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_MODE="redo") | ||||
|     def test_with_form_error(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf" | ||||
|         ) | ||||
|  | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text(), | ||||
|             ["Please enter your name in here:", "This is a PDF document with a form."], | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip") | ||||
|     def test_signed(self): | ||||
| @@ -145,32 +155,49 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "signed.pdf"), "application/pdf") | ||||
|  | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings(parser.get_text(), ["This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable", "automated testing of signed/encrypted PDFs"]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text(), | ||||
|             [ | ||||
|                 "This is a digitally signed PDF, created with Acrobat Pro for the Paperless project to enable", | ||||
|                 "automated testing of signed/encrypted PDFs", | ||||
|             ], | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip") | ||||
|     def test_encrypted(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "encrypted.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "encrypted.pdf"), "application/pdf" | ||||
|         ) | ||||
|  | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertEqual(parser.get_text(), "") | ||||
|  | ||||
|  | ||||
|     @override_settings(OCR_MODE="redo") | ||||
|     def test_with_form_error_notext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf" | ||||
|         ) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text(), | ||||
|             ["Please enter your name in here:", "This is a PDF document with a form."], | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_MODE="force") | ||||
|     def test_with_form_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "with-form.pdf"), "application/pdf" | ||||
|         ) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text(), ["Please enter your name in here:", "This is a PDF document with a form."]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text(), | ||||
|             ["Please enter your name in here:", "This is a PDF document with a form."], | ||||
|         ) | ||||
|  | ||||
|     def test_image_simple(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
| @@ -193,7 +220,9 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|     def test_image_calc_a4_dpi(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         dpi = parser.calculate_a4_dpi(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png")) | ||||
|         dpi = parser.calculate_a4_dpi( | ||||
|             os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png") | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual(dpi, 62) | ||||
|  | ||||
| @@ -203,7 +232,9 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|  | ||||
|         def f(): | ||||
|             parser.parse(os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png") | ||||
|             parser.parse( | ||||
|                 os.path.join(self.SAMPLE_FILES, "simple-no-dpi.png"), "image/png" | ||||
|             ) | ||||
|  | ||||
|         self.assertRaises(ParseError, f) | ||||
|  | ||||
| @@ -215,46 +246,70 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|  | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["this is a test document."]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), ["this is a test document."] | ||||
|         ) | ||||
|  | ||||
|     def test_multi_page(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), ["page 1", "page 2", "page 3"] | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="skip") | ||||
|     def test_multi_page_pages_skip(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), ["page 1", "page 2", "page 3"] | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||
|     def test_multi_page_pages_redo(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), ["page 1", "page 2", "page 3"] | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="force") | ||||
|     def test_multi_page_pages_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), ["page 1", "page 2", "page 3"] | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip") | ||||
|     def test_multi_page_analog_pages_skip(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), ["page 1", "page 2", "page 3"] | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_PAGES=2, OCR_MODE="redo") | ||||
|     def test_multi_page_analog_pages_redo(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2"]) | ||||
|         self.assertFalse("page 3" in parser.get_text().lower()) | ||||
| @@ -262,7 +317,9 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|     @override_settings(OCR_PAGES=1, OCR_MODE="force") | ||||
|     def test_multi_page_analog_pages_force(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1"]) | ||||
|         self.assertFalse("page 2" in parser.get_text().lower()) | ||||
| @@ -271,23 +328,36 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|     @override_settings(OCR_MODE="skip_noarchive") | ||||
|     def test_skip_noarchive_withtext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-digital.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), ["page 1", "page 2", "page 3"] | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip_noarchive") | ||||
|     def test_skip_noarchive_notext(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-images.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3"]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), ["page 1", "page 2", "page 3"] | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip") | ||||
|     def test_multi_page_mixed(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertTrue(os.path.isfile(parser.archive_path)) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), | ||||
|             ["page 1", "page 2", "page 3", "page 4", "page 5", "page 6"], | ||||
|         ) | ||||
|  | ||||
|         with open(os.path.join(parser.tempdir, "sidecar.txt")) as f: | ||||
|             sidecar = f.read() | ||||
| @@ -297,30 +367,41 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|     @override_settings(OCR_MODE="skip_noarchive") | ||||
|     def test_multi_page_mixed_no_archive(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"), "application/pdf") | ||||
|         parser.parse( | ||||
|             os.path.join(self.SAMPLE_FILES, "multi-page-mixed.pdf"), "application/pdf" | ||||
|         ) | ||||
|         self.assertIsNone(parser.archive_path) | ||||
|         self.assertContainsStrings(parser.get_text().lower(), ["page 4", "page 5", "page 6"]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text().lower(), ["page 4", "page 5", "page 6"] | ||||
|         ) | ||||
|  | ||||
|     @override_settings(OCR_MODE="skip", OCR_ROTATE_PAGES=True) | ||||
|     def test_rotate(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         parser.parse(os.path.join(self.SAMPLE_FILES, "rotated.pdf"), "application/pdf") | ||||
|         self.assertContainsStrings(parser.get_text(), [ | ||||
|             "This is the text that appears on the first page. It’s a lot of text.", | ||||
|             "Even if the pages are rotated, OCRmyPDF still gets the job done.", | ||||
|             "This is a really weird file with lots of nonsense text.", | ||||
|             "If you read this, it’s your own fault. Also check your screen orientation." | ||||
|         ]) | ||||
|         self.assertContainsStrings( | ||||
|             parser.get_text(), | ||||
|             [ | ||||
|                 "This is the text that appears on the first page. It’s a lot of text.", | ||||
|                 "Even if the pages are rotated, OCRmyPDF still gets the job done.", | ||||
|                 "This is a really weird file with lots of nonsense text.", | ||||
|                 "If you read this, it’s your own fault. Also check your screen orientation.", | ||||
|             ], | ||||
|         ) | ||||
|  | ||||
|     def test_ocrmypdf_parameters(self): | ||||
|         parser = RasterisedDocumentParser(None) | ||||
|         params = parser.construct_ocrmypdf_parameters(input_file="input.pdf", output_file="output.pdf", | ||||
|                                                       sidecar_file="sidecar.txt", mime_type="application/pdf", | ||||
|                                                       safe_fallback=False) | ||||
|         params = parser.construct_ocrmypdf_parameters( | ||||
|             input_file="input.pdf", | ||||
|             output_file="output.pdf", | ||||
|             sidecar_file="sidecar.txt", | ||||
|             mime_type="application/pdf", | ||||
|             safe_fallback=False, | ||||
|         ) | ||||
|  | ||||
|         self.assertEqual(params['input_file'], "input.pdf") | ||||
|         self.assertEqual(params['output_file'], "output.pdf") | ||||
|         self.assertEqual(params['sidecar'], "sidecar.txt") | ||||
|         self.assertEqual(params["input_file"], "input.pdf") | ||||
|         self.assertEqual(params["output_file"], "output.pdf") | ||||
|         self.assertEqual(params["sidecar"], "sidecar.txt") | ||||
|  | ||||
|         with override_settings(OCR_CLEAN="none"): | ||||
|             params = parser.construct_ocrmypdf_parameters("", "", "", "") | ||||
| @@ -329,30 +410,31 @@ class TestParser(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         with override_settings(OCR_CLEAN="clean"): | ||||
|             params = parser.construct_ocrmypdf_parameters("", "", "", "") | ||||
|             self.assertTrue(params['clean']) | ||||
|             self.assertTrue(params["clean"]) | ||||
|             self.assertNotIn("clean_final", params) | ||||
|  | ||||
|         with override_settings(OCR_CLEAN="clean-final", OCR_MODE="skip"): | ||||
|             params = parser.construct_ocrmypdf_parameters("", "", "", "") | ||||
|             self.assertTrue(params['clean_final']) | ||||
|             self.assertTrue(params["clean_final"]) | ||||
|             self.assertNotIn("clean", params) | ||||
|  | ||||
|         with override_settings(OCR_CLEAN="clean-final", OCR_MODE="redo"): | ||||
|             params = parser.construct_ocrmypdf_parameters("", "", "", "") | ||||
|             self.assertTrue(params['clean']) | ||||
|             self.assertTrue(params["clean"]) | ||||
|             self.assertNotIn("clean_final", params) | ||||
|  | ||||
|         with override_settings(OCR_DESKEW=True, OCR_MODE="skip"): | ||||
|             params = parser.construct_ocrmypdf_parameters("", "", "", "") | ||||
|             self.assertTrue(params['deskew']) | ||||
|             self.assertTrue(params["deskew"]) | ||||
|  | ||||
|         with override_settings(OCR_DESKEW=True, OCR_MODE="redo"): | ||||
|             params = parser.construct_ocrmypdf_parameters("", "", "", "") | ||||
|             self.assertNotIn('deskew', params) | ||||
|             self.assertNotIn("deskew", params) | ||||
|  | ||||
|         with override_settings(OCR_DESKEW=False, OCR_MODE="skip"): | ||||
|             params = parser.construct_ocrmypdf_parameters("", "", "", "") | ||||
|             self.assertNotIn('deskew', params) | ||||
|             self.assertNotIn("deskew", params) | ||||
|  | ||||
|  | ||||
| class TestParserFileTypes(DirectoriesMixin, TestCase): | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 kpj
					kpj