	Improves the logging mixin and allows it to be typed better
@@ -92,7 +92,7 @@ class Consumer(LoggingMixin):
         exception: Optional[Exception] = None,
     ):
         self._send_progress(100, 100, "FAILED", message)
-        self.log("error", log_message or message, exc_info=exc_info)
+        self.log.error(log_message or message, exc_info=exc_info)
         raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
 
     def __init__(self):
@@ -175,7 +175,7 @@ class Consumer(LoggingMixin):
                 f"{settings.PRE_CONSUME_SCRIPT} does not exist.",
             )
 
-        self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
+        self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
 
         working_file_path = str(self.path)
         original_file_path = str(self.original_path)
@@ -218,8 +218,7 @@ class Consumer(LoggingMixin):
                 f"{settings.POST_CONSUME_SCRIPT} does not exist.",
             )
 
-        self.log(
-            "info",
+        self.log.info(
             f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}",
         )
 
@@ -311,11 +310,6 @@ class Consumer(LoggingMixin):
 
         self._send_progress(0, 100, "STARTING", MESSAGE_NEW_FILE)
 
-        # this is for grouping logging entries for this particular file
-        # together.
-
-        self.renew_logging_group()
-
         # Make sure that preconditions for consuming the file are met.
 
         self.pre_check_file_exists()
@@ -323,7 +317,7 @@ class Consumer(LoggingMixin):
         self.pre_check_duplicate()
         self.pre_check_asn_value()
 
-        self.log("info", f"Consuming {self.filename}")
+        self.log.info(f"Consuming {self.filename}")
 
         # For the actual work, copy the file into a tempdir
         self.original_path = self.path
@@ -338,7 +332,7 @@ class Consumer(LoggingMixin):
 
         mime_type = magic.from_file(self.path, mime=True)
 
-        self.log("debug", f"Detected mime type: {mime_type}")
+        self.log.debug(f"Detected mime type: {mime_type}")
 
         # Based on the mime type, get the parser for that type
         parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type(
@@ -370,7 +364,7 @@ class Consumer(LoggingMixin):
             progress_callback,
         )
 
-        self.log("debug", f"Parser: {type(document_parser).__name__}")
+        self.log.debug(f"Parser: {type(document_parser).__name__}")
 
         # However, this already created working directories which we have to
         # clean up.
@@ -384,10 +378,10 @@ class Consumer(LoggingMixin):
 
         try:
             self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT)
-            self.log("debug", f"Parsing {self.filename}...")
+            self.log.debug(f"Parsing {self.filename}...")
             document_parser.parse(self.path, mime_type, self.filename)
 
-            self.log("debug", f"Generating thumbnail for {self.filename}...")
+            self.log.debug(f"Generating thumbnail for {self.filename}...")
             self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)
             thumbnail = document_parser.get_thumbnail(
                 self.path,
@@ -475,7 +469,7 @@ class Consumer(LoggingMixin):
                 document.save()
 
                 # Delete the file only if it was successfully consumed
-                self.log("debug", f"Deleting file {self.path}")
+                self.log.debug(f"Deleting file {self.path}")
                 os.unlink(self.path)
                 self.original_path.unlink()
 
@@ -486,7 +480,7 @@ class Consumer(LoggingMixin):
                 )
 
                 if os.path.isfile(shadow_file):
-                    self.log("debug", f"Deleting file {shadow_file}")
+                    self.log.debug(f"Deleting file {shadow_file}")
                     os.unlink(shadow_file)
 
         except Exception as e:
@@ -503,7 +497,7 @@ class Consumer(LoggingMixin):
 
         self.run_post_consume_script(document)
 
-        self.log("info", f"Document {document} consumption finished")
+        self.log.info(f"Document {document} consumption finished")
 
         self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id)
 
@@ -522,26 +516,25 @@ class Consumer(LoggingMixin):
 
         file_info = FileInfo.from_filename(self.filename)
 
-        self.log("debug", "Saving record to database")
+        self.log.debug("Saving record to database")
 
         if self.override_created is not None:
             create_date = self.override_created
-            self.log(
-                "debug",
+            self.log.debug(
                 f"Creation date from post_documents parameter: {create_date}",
             )
         elif file_info.created is not None:
             create_date = file_info.created
-            self.log("debug", f"Creation date from FileInfo: {create_date}")
+            self.log.debug(f"Creation date from FileInfo: {create_date}")
         elif date is not None:
             create_date = date
-            self.log("debug", f"Creation date from parse_date: {create_date}")
+            self.log.debug(f"Creation date from parse_date: {create_date}")
         else:
             stats = os.stat(self.path)
             create_date = timezone.make_aware(
                 datetime.datetime.fromtimestamp(stats.st_mtime),
             )
-            self.log("debug", f"Creation date from st_mtime: {create_date}")
+            self.log.debug(f"Creation date from st_mtime: {create_date}")
 
         storage_type = Document.STORAGE_TYPE_UNENCRYPTED
 
@@ -595,8 +588,7 @@ class Consumer(LoggingMixin):
         Decodes a process stdout and stderr streams and logs them to the main log
        """
         # Log what the script exited as
-        self.log(
-            "info",
+        self.log.info(
            f"{completed_process.args[0]} exited {completed_process.returncode}",
         )
 
@@ -609,9 +601,9 @@ class Consumer(LoggingMixin):
                     "\n",
                 )
             )
-            self.log("info", "Script stdout:")
+            self.log.info("Script stdout:")
             for line in stdout_str:
-                self.log("info", line)
+                self.log.info(line)
 
         if len(completed_process.stderr):
             stderr_str = (
@@ -622,6 +614,6 @@ class Consumer(LoggingMixin):
                 )
             )
 
-            self.log("warning", "Script stderr:")
+            self.log.warning("Script stderr:")
             for line in stderr_str:
-                self.log("warning", line)
+                self.log.warning(line)

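The consumer-side changes are mechanical: the level string moves into the method name, and keyword arguments such as exc_info pass through unchanged. Note also that the renew_logging_group() call removed from the consume path is not lost: with the reworked mixin in the next diff, it runs from the mixin's __init__, so every freshly constructed Consumer starts with a new logging group. A minimal sketch of the resulting control flow; the super().__init__() chaining is an assumption, it is not shown in this diff:

# Sketch only: names are from the diff, the chaining is assumed.
class LoggingMixin:
    def __init__(self) -> None:
        self.renew_logging_group()  # creates the group UUID and the adapter

class Consumer(LoggingMixin):
    def __init__(self):
        super().__init__()  # assumed: gives self.log a fresh logging group
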
@@ -3,18 +3,16 @@ import uuid
 
 
 class LoggingMixin:
-
     logging_group = None
 
     logging_name = None
 
+    def __init__(self) -> None:
+        self.renew_logging_group()
+
     def renew_logging_group(self):
+        """
+        Creates a new UUID to group subsequent log calls together with
+        the extra data named group
+        """
         self.logging_group = uuid.uuid4()
 
-    def log(self, level, message, **kwargs):
-        if self.logging_name:
-            logger = logging.getLogger(self.logging_name)
-        else:
-            name = ".".join([self.__class__.__module__, self.__class__.__name__])
-            logger = logging.getLogger(name)
-
-        getattr(logger, level)(message, extra={"group": self.logging_group}, **kwargs)
+        self.log = logging.LoggerAdapter(
+            logging.getLogger(self.logging_name),
+            extra={"group": self.logging_group},
+        )

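This is the heart of the commit: instead of a log(level, message) shim that dispatched through getattr() at runtime, the mixin now builds a logging.LoggerAdapter. The adapter wraps a named logger and merges its extra dict into every record, so self.log.debug(...), self.log.info(...) and friends are the standard Logger methods a type checker can verify, while the group UUID still tags each record. A standalone sketch of the mechanism; the handler setup and the "demo" logger name are illustrative, not part of the commit:

import logging
import uuid

# Make the injected "group" attribute visible in the output format.
logging.basicConfig(format="%(levelname)s [%(group)s] %(message)s", level=logging.DEBUG)

group = uuid.uuid4()
log = logging.LoggerAdapter(logging.getLogger("demo"), extra={"group": group})

# Every call carries extra={"group": ...} without repeating it per call,
# and kwargs such as exc_info behave exactly as on a plain Logger.
log.info("Consuming file")
log.warning("Something odd happened", exc_info=False)

One subtlety: logging.getLogger(self.logging_name) returns the root logger while logging_name is still None, so subclasses are expected to set logging_name (for example "paperless.consumer"), just as they did with the old shim.
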
@@ -328,7 +328,7 @@ class DocumentParser(LoggingMixin):
         try:
             text = filepath.read_text(encoding="utf-8")
         except UnicodeDecodeError as e:
-            self.log("warning", f"Unicode error during text reading, continuing: {e}")
+            self.log.warning(f"Unicode error during text reading, continuing: {e}")
             text = filepath.read_bytes().decode("utf-8", errors="replace")
         return text
 
@@ -354,5 +354,5 @@ class DocumentParser(LoggingMixin):
         return self.date
 
     def cleanup(self):
-        self.log("debug", f"Deleting directory {self.tempdir}")
+        self.log.debug(f"Deleting directory {self.tempdir}")
         shutil.rmtree(self.tempdir)

@@ -832,8 +832,7 @@ class PreConsumeTestCase(TestCase):
                 }
                 self.assertDictEqual(environment, {**environment, **subset})
 
-    @mock.patch("documents.consumer.Consumer.log")
-    def test_script_with_output(self, mocked_log):
+    def test_script_with_output(self):
         """
         GIVEN:
             - A script which outputs to stdout and stderr
@@ -854,14 +853,19 @@ class PreConsumeTestCase(TestCase):
             os.chmod(script.name, st.st_mode | stat.S_IEXEC)
 
             with override_settings(PRE_CONSUME_SCRIPT=script.name):
-                c = Consumer()
-                c.path = "path-to-file"
-                c.run_pre_consume_script()
-
-                mocked_log.assert_called()
-
-                mocked_log.assert_any_call("info", "This message goes to stdout")
-                mocked_log.assert_any_call("warning", "This message goes to stderr")
+                with self.assertLogs("paperless.consumer", level="INFO") as cm:
+                    c = Consumer()
+                    c.path = "path-to-file"
+
+                    c.run_pre_consume_script()
+                    self.assertIn(
+                        "INFO:paperless.consumer:This message goes to stdout",
+                        cm.output,
+                    )
+                    self.assertIn(
+                        "WARNING:paperless.consumer:This message goes to stderr",
+                        cm.output,
+                    )
 
     def test_script_exit_non_zero(self):
         """

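With messages flowing through a real logger, the test no longer patches Consumer.log; it captures records with unittest's assertLogs, whose output attribute is a list of "LEVEL:logger.name:message" strings. A minimal, self-contained illustration of the pattern; the logger name "paperless.consumer" is the one the diff asserts against:

import logging
import unittest

class AssertLogsDemo(unittest.TestCase):
    def test_captures_formatted_records(self):
        # assertLogs collects records emitted at INFO or above on this logger,
        # even when no handlers are configured.
        with self.assertLogs("paperless.consumer", level="INFO") as cm:
            logging.getLogger("paperless.consumer").info("This message goes to stdout")
        self.assertIn("INFO:paperless.consumer:This message goes to stdout", cm.output)

if __name__ == "__main__":
    unittest.main()
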
@@ -422,7 +422,7 @@ class MailAccountHandler(LoggingMixin):
         try:
             return Correspondent.objects.get_or_create(name=name)[0]
         except DatabaseError as e:
-            self.log("error", f"Error while retrieving correspondent {name}: {e}")
+            self.log.error(f"Error while retrieving correspondent {name}: {e}")
             return None
 
     def _get_title(self, message, att, rule):
@@ -468,7 +468,7 @@ class MailAccountHandler(LoggingMixin):
 
         self.renew_logging_group()
 
-        self.log("debug", f"Processing mail account {account}")
+        self.log.debug(f"Processing mail account {account}")
 
         total_processed_files = 0
         try:
@@ -480,13 +480,12 @@ class MailAccountHandler(LoggingMixin):
                 supports_gmail_labels = "X-GM-EXT-1" in M.client.capabilities
                 supports_auth_plain = "AUTH=PLAIN" in M.client.capabilities
 
-                self.log("debug", f"GMAIL Label Support: {supports_gmail_labels}")
-                self.log("debug", f"AUTH=PLAIN Support: {supports_auth_plain}")
+                self.log.debug(f"GMAIL Label Support: {supports_gmail_labels}")
+                self.log.debug(f"AUTH=PLAIN Support: {supports_auth_plain}")
 
                 mailbox_login(M, account)
 
-                self.log(
-                    "debug",
+                self.log.debug(
                     f"Account {account}: Processing "
                     f"{account.rules.count()} rule(s)",
                 )
@@ -499,16 +498,13 @@ class MailAccountHandler(LoggingMixin):
                             supports_gmail_labels,
                         )
                     except Exception as e:
-                        self.log(
-                            "error",
+                        self.log.exception(
                             f"Rule {rule}: Error while processing rule: {e}",
-                            exc_info=True,
                         )
         except MailError:
             raise
         except Exception as e:
-            self.log(
-                "error",
+            self.log.error(
                 f"Error while retrieving mailbox {account}: {e}",
                 exc_info=False,
             )
@@ -521,21 +517,19 @@ class MailAccountHandler(LoggingMixin):
         rule: MailRule,
         supports_gmail_labels: bool,
     ):
-        self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}")
+        self.log.debug(f"Rule {rule}: Selecting folder {rule.folder}")
 
         try:
             M.folder.set(rule.folder)
         except MailboxFolderSelectError as err:
-            self.log(
-                "error",
+            self.log.error(
                 f"Unable to access folder {rule.folder}, attempting folder listing",
             )
             try:
                 for folder_info in M.folder.list():
-                    self.log("info", f"Located folder: {folder_info.name}")
+                    self.log.info(f"Located folder: {folder_info.name}")
             except Exception as e:
-                self.log(
-                    "error",
+                self.log.error(
                     "Exception during folder listing, unable to provide list folders: "
                     + str(e),
                 )
@@ -547,8 +541,7 @@ class MailAccountHandler(LoggingMixin):
 
         criterias = make_criterias(rule, supports_gmail_labels)
 
-        self.log(
-            "debug",
+        self.log.debug(
             f"Rule {rule}: Searching folder with criteria {str(criterias)}",
         )
 
@@ -572,7 +565,7 @@ class MailAccountHandler(LoggingMixin):
                 uid=message.uid,
                 folder=rule.folder,
             ).exists():
-                self.log("debug", f"Skipping mail {message}, already processed.")
+                self.log.debug(f"Skipping mail {message}, already processed.")
                 continue
 
             try:
@@ -581,13 +574,11 @@ class MailAccountHandler(LoggingMixin):
                 total_processed_files += processed_files
                 mails_processed += 1
             except Exception as e:
-                self.log(
-                    "error",
+                self.log.exception(
                     f"Rule {rule}: Error while processing mail {message.uid}: {e}",
-                    exc_info=True,
                 )
 
-        self.log("debug", f"Rule {rule}: Processed {mails_processed} matching mail(s)")
+        self.log.debug(f"Rule {rule}: Processed {mails_processed} matching mail(s)")
 
         return total_processed_files
 
@@ -602,8 +593,7 @@ class MailAccountHandler(LoggingMixin):
         ):
             return processed_elements
 
-        self.log(
-            "debug",
+        self.log.debug(
             f"Rule {rule}: "
             f"Processing mail {message.subject} from {message.from_} with "
             f"{len(message.attachments)} attachment(s)",
@@ -657,8 +647,7 @@ class MailAccountHandler(LoggingMixin):
                 and rule.attachment_type
                 == MailRule.AttachmentProcessing.ATTACHMENTS_ONLY
             ):
-                self.log(
-                    "debug",
+                self.log.debug(
                     f"Rule {rule}: "
                     f"Skipping attachment {att.filename} "
                     f"with content disposition {att.content_disposition}",
@@ -688,8 +677,7 @@ class MailAccountHandler(LoggingMixin):
                 with open(temp_filename, "wb") as f:
                     f.write(att.payload)
 
-                self.log(
-                    "info",
+                self.log.info(
                     f"Rule {rule}: "
                     f"Consuming attachment {att.filename} from mail "
                     f"{message.subject} from {message.from_}",
@@ -717,8 +705,7 @@ class MailAccountHandler(LoggingMixin):
 
                 processed_attachments += 1
             else:
-                self.log(
-                    "debug",
+                self.log.debug(
                     f"Rule {rule}: "
                     f"Skipping attachment {att.filename} "
                     f"since guessed mime type {mime_type} is not supported "
@@ -784,8 +771,7 @@ class MailAccountHandler(LoggingMixin):
 
             f.write(message.obj.as_bytes())
 
-        self.log(
-            "info",
+        self.log.info(
             f"Rule {rule}: "
             f"Consuming eml from mail "
             f"{message.subject} from {message.from_}",

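Two of the hunks above upgrade the error logging rather than merely translating it: self.log("error", ..., exc_info=True) becomes self.log.exception(...). Logger.exception() (available on the adapter as well) logs at ERROR level and sets exc_info=True implicitly, which is why the explicit argument disappears; it is meant to be called from inside an except block. A short standalone sketch; the "demo" logger name is illustrative:

import logging

log = logging.getLogger("demo")

try:
    1 / 0
except ZeroDivisionError as e:
    # Equivalent to log.error(f"...: {e}", exc_info=True):
    # the message is logged at ERROR with the traceback appended.
    log.exception(f"Error while processing rule: {e}")

Note also that MailAccountHandler keeps calling renew_logging_group() per account; under the new mixin this rebuilds the adapter too, so each account's records carry a fresh group UUID.
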
@@ -64,8 +64,7 @@ class MailDocumentParser(DocumentParser):
         try:
             mail = self.get_parsed(document_path)
         except ParseError as e:
-            self.log(
-                "warning",
+            self.log.warning(
                 f"Error while fetching document metadata for {document_path}: {e}",
             )
             return result
@@ -146,7 +145,7 @@ class MailDocumentParser(DocumentParser):
         self.archive_path = self.generate_pdf(document_path)
 
     def tika_parse(self, html: str):
-        self.log("info", "Sending content to Tika server")
+        self.log.info("Sending content to Tika server")
 
         try:
             parsed = parser.from_buffer(html, self.tika_server)
@@ -251,7 +250,7 @@ class MailDocumentParser(DocumentParser):
 
     def generate_pdf_from_mail(self, mail):
         url = self.gotenberg_server + "/forms/chromium/convert/html"
-        self.log("info", "Converting mail to PDF")
+        self.log.info("Converting mail to PDF")
 
         css_file = os.path.join(os.path.dirname(__file__), "templates/output.css")
 
@@ -320,7 +319,7 @@ class MailDocumentParser(DocumentParser):
 
     def generate_pdf_from_html(self, orig_html, attachments):
         url = self.gotenberg_server + "/forms/chromium/convert/html"
-        self.log("info", "Converting html to PDF")
+        self.log.info("Converting html to PDF")
 
         files = {}
         for name, file in self.transform_inline_html(orig_html, attachments):

@@ -129,8 +129,7 @@ class TestParser(FileSystemAssertsMixin, TestCase):
         )
         self.assertEqual(mocked_return, thumb)
 
-    @mock.patch("documents.loggers.LoggingMixin.log")
-    def test_extract_metadata_fail(self, m: mock.MagicMock):
+    def test_extract_metadata_fail(self):
         """
         GIVEN:
             - Fresh start
@@ -140,8 +139,12 @@ class TestParser(FileSystemAssertsMixin, TestCase):
             - A log warning should be generated
         """
         # Validate if warning is logged when parsing fails
-        self.assertEqual([], self.parser.extract_metadata("na", "message/rfc822"))
-        self.assertEqual("warning", m.call_args[0][0])
+        with self.assertLogs("paperless.parsing.mail", level="WARNING") as cm:
+            self.assertEqual([], self.parser.extract_metadata("na", "message/rfc822"))
+            self.assertIn(
+                "WARNING:paperless.parsing.mail:Error while fetching document metadata for na",
+                cm.output[0],
+            )
 
     def test_extract_metadata(self):
         """

@@ -54,8 +54,7 @@ class RasterisedDocumentParser(DocumentParser):
                         },
                     )
                 except Exception as e:
-                    self.log(
-                        "warning",
+                    self.log.warning(
                         f"Error while reading metadata {key}: {value}. Error: {e}",
                     )
         return result
@@ -98,7 +97,7 @@ class RasterisedDocumentParser(DocumentParser):
                 x, y = im.info["dpi"]
                 return round(x)
         except Exception as e:
-            self.log("warning", f"Error while getting DPI from image {image}: {e}")
+            self.log.warning(f"Error while getting DPI from image {image}: {e}")
             return None
 
     def calculate_a4_dpi(self, image):
@@ -107,11 +106,11 @@ class RasterisedDocumentParser(DocumentParser):
                 width, height = im.size
                 # divide image width by A4 width (210mm) in inches.
                 dpi = int(width / (21 / 2.54))
-                self.log("debug", f"Estimated DPI {dpi} based on image width {width}")
+                self.log.debug(f"Estimated DPI {dpi} based on image width {width}")
                 return dpi
 
         except Exception as e:
-            self.log("warning", f"Error while calculating DPI for image {image}: {e}")
+            self.log.warning(f"Error while calculating DPI for image {image}: {e}")
             return None
 
     def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
@@ -127,10 +126,10 @@ class RasterisedDocumentParser(DocumentParser):
             if "[OCR skipped on page" not in text:
                 # This happens when there's already text in the input file.
                 # The sidecar file will only contain text for OCR'ed pages.
-                self.log("debug", "Using text from sidecar file")
+                self.log.debug("Using text from sidecar file")
                 return post_process_text(text)
             else:
-                self.log("debug", "Incomplete sidecar file: discarding.")
+                self.log.debug("Incomplete sidecar file: discarding.")
 
         # no success with the sidecar file, try PDF
 
@@ -160,8 +159,7 @@ class RasterisedDocumentParser(DocumentParser):
 
         except Exception:
             #  If pdftotext fails, fall back to OCR.
-            self.log(
-                "warning",
+            self.log.warning(
                 "Error while getting text from PDF document with pdftotext",
                 exc_info=True,
             )
@@ -227,15 +225,14 @@ class RasterisedDocumentParser(DocumentParser):
             a4_dpi = self.calculate_a4_dpi(input_file)
 
             if self.has_alpha(input_file):
-                self.log(
-                    "info",
+                self.log.info(
                     f"Removing alpha layer from {input_file} "
                     "for compatibility with img2pdf",
                 )
                 self.remove_alpha(input_file)
 
             if dpi:
-                self.log("debug", f"Detected DPI for image {input_file}: {dpi}")
+                self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
                 ocrmypdf_args["image_dpi"] = dpi
             elif settings.OCR_IMAGE_DPI:
                 ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI
@@ -253,8 +250,7 @@ class RasterisedDocumentParser(DocumentParser):
                 user_args = json.loads(settings.OCR_USER_ARGS)
                 ocrmypdf_args = {**ocrmypdf_args, **user_args}
             except Exception as e:
-                self.log(
-                    "warning",
+                self.log.warning(
                     f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                     f"they will not be used. Error: {e}",
                 )
@@ -263,15 +259,13 @@ class RasterisedDocumentParser(DocumentParser):
             # Convert pixels to mega-pixels and provide to ocrmypdf
             max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
             if max_pixels_mpixels > 0:
-                self.log(
-                    "debug",
+                self.log.debug(
                     f"Calculated {max_pixels_mpixels} megapixels for OCR",
                 )
 
                 ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
             else:
-                self.log(
-                    "warning",
+                self.log.warning(
                     "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
                     "this value must be at least 1 megapixel if set",
                 )
@@ -299,7 +293,7 @@ class RasterisedDocumentParser(DocumentParser):
             or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
         )
         if skip_archive_for_text and original_has_text:
-            self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
+            self.log.debug("Document has text, skipping OCRmyPDF entirely.")
             self.text = text_original
             return
 
@@ -322,7 +316,7 @@ class RasterisedDocumentParser(DocumentParser):
         )
 
         try:
-            self.log("debug", f"Calling OCRmyPDF with args: {args}")
+            self.log.debug(f"Calling OCRmyPDF with args: {args}")
             ocrmypdf.ocr(**args)
 
             if settings.OCR_SKIP_ARCHIVE_FILE != "always":
@@ -333,16 +327,14 @@ class RasterisedDocumentParser(DocumentParser):
             if not self.text:
                 raise NoTextFoundException("No text was found in the original document")
         except EncryptedPdfError:
-            self.log(
-                "warning",
+            self.log.warning(
                 "This file is encrypted, OCR is impossible. Using "
                 "any text present in the original file.",
             )
             if original_has_text:
                 self.text = text_original
         except (NoTextFoundException, InputFileError) as e:
-            self.log(
-                "warning",
+            self.log.warning(
                 f"Encountered an error while running OCR: {str(e)}. "
                 f"Attempting force OCR to get the text.",
             )
@@ -365,7 +357,7 @@ class RasterisedDocumentParser(DocumentParser):
             )
 
             try:
-                self.log("debug", f"Fallback: Calling OCRmyPDF with args: {args}")
+                self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
                 ocrmypdf.ocr(**args)
 
                 # Don't return the archived file here, since this file
@@ -390,8 +382,7 @@ class RasterisedDocumentParser(DocumentParser):
             if original_has_text:
                 self.text = text_original
             else:
-                self.log(
-                    "warning",
+                self.log.warning(
                     f"No text was found in {document_path}, the content will "
                     f"be empty.",
                 )

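Aside from the logging change, calculate_a4_dpi() above is worth a quick sanity check: it divides the pixel width by the physical width of A4 paper in inches (21 cm / 2.54 cm-per-inch, roughly 8.27 in). A standalone worked example; the width value is illustrative:

# An A4 page scanned at about 200 DPI is roughly 1654 pixels wide:
# 1654 / (21 / 2.54) = 1654 / 8.27... which truncates to 200.
width = 1654
dpi = int(width / (21 / 2.54))
assert dpi == 200
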
@@ -38,8 +38,7 @@ class TikaDocumentParser(DocumentParser):
         try:
             parsed = parser.from_file(document_path, tika_server)
         except Exception as e:
-            self.log(
-                "warning",
+            self.log.warning(
                 f"Error while fetching document metadata for {document_path}: {e}",
             )
             return []
@@ -55,7 +54,7 @@ class TikaDocumentParser(DocumentParser):
         ]
 
     def parse(self, document_path: Path, mime_type, file_name=None):
-        self.log("info", f"Sending {document_path} to Tika server")
+        self.log.info(f"Sending {document_path} to Tika server")
         tika_server = settings.TIKA_ENDPOINT
 
         # tika does not support a PathLike, only strings
@@ -75,8 +74,7 @@ class TikaDocumentParser(DocumentParser):
         try:
             self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
         except Exception as e:
-            self.log(
-                "warning",
+            self.log.warning(
                 f"Unable to extract date for document {document_path}: {e}",
             )
 
@@ -87,7 +85,7 @@ class TikaDocumentParser(DocumentParser):
         gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
         url = gotenberg_server + "/forms/libreoffice/convert"
 
-        self.log("info", f"Converting {document_path} to PDF as {pdf_path}")
+        self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
         with open(document_path, "rb") as document_handle:
             files = {
                 "files": (

Author: Trenton H