Improves the logging mixin and allows it to be typed better

This commit is contained in:
Trenton H 2023-05-19 15:23:11 -07:00
parent 37959fe31c
commit 452c79f9a1
9 changed files with 99 additions and 128 deletions

View File

@ -92,7 +92,7 @@ class Consumer(LoggingMixin):
exception: Optional[Exception] = None, exception: Optional[Exception] = None,
): ):
self._send_progress(100, 100, "FAILED", message) self._send_progress(100, 100, "FAILED", message)
self.log("error", log_message or message, exc_info=exc_info) self.log.error(log_message or message, exc_info=exc_info)
raise ConsumerError(f"{self.filename}: {log_message or message}") from exception raise ConsumerError(f"{self.filename}: {log_message or message}") from exception
def __init__(self): def __init__(self):
@ -175,7 +175,7 @@ class Consumer(LoggingMixin):
f"{settings.PRE_CONSUME_SCRIPT} does not exist.", f"{settings.PRE_CONSUME_SCRIPT} does not exist.",
) )
self.log("info", f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}") self.log.info(f"Executing pre-consume script {settings.PRE_CONSUME_SCRIPT}")
working_file_path = str(self.path) working_file_path = str(self.path)
original_file_path = str(self.original_path) original_file_path = str(self.original_path)
@ -218,8 +218,7 @@ class Consumer(LoggingMixin):
f"{settings.POST_CONSUME_SCRIPT} does not exist.", f"{settings.POST_CONSUME_SCRIPT} does not exist.",
) )
self.log( self.log.info(
"info",
f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}", f"Executing post-consume script {settings.POST_CONSUME_SCRIPT}",
) )
@ -311,11 +310,6 @@ class Consumer(LoggingMixin):
self._send_progress(0, 100, "STARTING", MESSAGE_NEW_FILE) self._send_progress(0, 100, "STARTING", MESSAGE_NEW_FILE)
# this is for grouping logging entries for this particular file
# together.
self.renew_logging_group()
# Make sure that preconditions for consuming the file are met. # Make sure that preconditions for consuming the file are met.
self.pre_check_file_exists() self.pre_check_file_exists()
@ -323,7 +317,7 @@ class Consumer(LoggingMixin):
self.pre_check_duplicate() self.pre_check_duplicate()
self.pre_check_asn_value() self.pre_check_asn_value()
self.log("info", f"Consuming {self.filename}") self.log.info(f"Consuming {self.filename}")
# For the actual work, copy the file into a tempdir # For the actual work, copy the file into a tempdir
self.original_path = self.path self.original_path = self.path
@ -338,7 +332,7 @@ class Consumer(LoggingMixin):
mime_type = magic.from_file(self.path, mime=True) mime_type = magic.from_file(self.path, mime=True)
self.log("debug", f"Detected mime type: {mime_type}") self.log.debug(f"Detected mime type: {mime_type}")
# Based on the mime type, get the parser for that type # Based on the mime type, get the parser for that type
parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type( parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type(
@ -370,7 +364,7 @@ class Consumer(LoggingMixin):
progress_callback, progress_callback,
) )
self.log("debug", f"Parser: {type(document_parser).__name__}") self.log.debug(f"Parser: {type(document_parser).__name__}")
# However, this already created working directories which we have to # However, this already created working directories which we have to
# clean up. # clean up.
@ -384,10 +378,10 @@ class Consumer(LoggingMixin):
try: try:
self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT) self._send_progress(20, 100, "WORKING", MESSAGE_PARSING_DOCUMENT)
self.log("debug", f"Parsing {self.filename}...") self.log.debug(f"Parsing {self.filename}...")
document_parser.parse(self.path, mime_type, self.filename) document_parser.parse(self.path, mime_type, self.filename)
self.log("debug", f"Generating thumbnail for {self.filename}...") self.log.debug(f"Generating thumbnail for {self.filename}...")
self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL) self._send_progress(70, 100, "WORKING", MESSAGE_GENERATING_THUMBNAIL)
thumbnail = document_parser.get_thumbnail( thumbnail = document_parser.get_thumbnail(
self.path, self.path,
@ -475,7 +469,7 @@ class Consumer(LoggingMixin):
document.save() document.save()
# Delete the file only if it was successfully consumed # Delete the file only if it was successfully consumed
self.log("debug", f"Deleting file {self.path}") self.log.debug(f"Deleting file {self.path}")
os.unlink(self.path) os.unlink(self.path)
self.original_path.unlink() self.original_path.unlink()
@ -486,7 +480,7 @@ class Consumer(LoggingMixin):
) )
if os.path.isfile(shadow_file): if os.path.isfile(shadow_file):
self.log("debug", f"Deleting file {shadow_file}") self.log.debug(f"Deleting file {shadow_file}")
os.unlink(shadow_file) os.unlink(shadow_file)
except Exception as e: except Exception as e:
@ -503,7 +497,7 @@ class Consumer(LoggingMixin):
self.run_post_consume_script(document) self.run_post_consume_script(document)
self.log("info", f"Document {document} consumption finished") self.log.info(f"Document {document} consumption finished")
self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id) self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id)
@ -522,26 +516,25 @@ class Consumer(LoggingMixin):
file_info = FileInfo.from_filename(self.filename) file_info = FileInfo.from_filename(self.filename)
self.log("debug", "Saving record to database") self.log.debug("Saving record to database")
if self.override_created is not None: if self.override_created is not None:
create_date = self.override_created create_date = self.override_created
self.log( self.log.debug(
"debug",
f"Creation date from post_documents parameter: {create_date}", f"Creation date from post_documents parameter: {create_date}",
) )
elif file_info.created is not None: elif file_info.created is not None:
create_date = file_info.created create_date = file_info.created
self.log("debug", f"Creation date from FileInfo: {create_date}") self.log.debug(f"Creation date from FileInfo: {create_date}")
elif date is not None: elif date is not None:
create_date = date create_date = date
self.log("debug", f"Creation date from parse_date: {create_date}") self.log.debug(f"Creation date from parse_date: {create_date}")
else: else:
stats = os.stat(self.path) stats = os.stat(self.path)
create_date = timezone.make_aware( create_date = timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime), datetime.datetime.fromtimestamp(stats.st_mtime),
) )
self.log("debug", f"Creation date from st_mtime: {create_date}") self.log.debug(f"Creation date from st_mtime: {create_date}")
storage_type = Document.STORAGE_TYPE_UNENCRYPTED storage_type = Document.STORAGE_TYPE_UNENCRYPTED
@ -595,8 +588,7 @@ class Consumer(LoggingMixin):
Decodes a process stdout and stderr streams and logs them to the main log Decodes a process stdout and stderr streams and logs them to the main log
""" """
# Log what the script exited as # Log what the script exited as
self.log( self.log.info(
"info",
f"{completed_process.args[0]} exited {completed_process.returncode}", f"{completed_process.args[0]} exited {completed_process.returncode}",
) )
@ -609,9 +601,9 @@ class Consumer(LoggingMixin):
"\n", "\n",
) )
) )
self.log("info", "Script stdout:") self.log.info("Script stdout:")
for line in stdout_str: for line in stdout_str:
self.log("info", line) self.log.info(line)
if len(completed_process.stderr): if len(completed_process.stderr):
stderr_str = ( stderr_str = (
@ -622,6 +614,6 @@ class Consumer(LoggingMixin):
) )
) )
self.log("warning", "Script stderr:") self.log.warning("Script stderr:")
for line in stderr_str: for line in stderr_str:
self.log("warning", line) self.log.warning(line)

View File

@ -3,18 +3,16 @@ import uuid
class LoggingMixin: class LoggingMixin:
logging_group = None def __init__(self) -> None:
self.renew_logging_group()
logging_name = None
def renew_logging_group(self): def renew_logging_group(self):
"""
Creates a new UUID to group subsequent log calls together with
the extra data named group
"""
self.logging_group = uuid.uuid4() self.logging_group = uuid.uuid4()
self.log = logging.LoggerAdapter(
def log(self, level, message, **kwargs): logging.getLogger(self.logging_name),
if self.logging_name: extra={"group": self.logging_group},
logger = logging.getLogger(self.logging_name) )
else:
name = ".".join([self.__class__.__module__, self.__class__.__name__])
logger = logging.getLogger(name)
getattr(logger, level)(message, extra={"group": self.logging_group}, **kwargs)

View File

@ -328,7 +328,7 @@ class DocumentParser(LoggingMixin):
try: try:
text = filepath.read_text(encoding="utf-8") text = filepath.read_text(encoding="utf-8")
except UnicodeDecodeError as e: except UnicodeDecodeError as e:
self.log("warning", f"Unicode error during text reading, continuing: {e}") self.log.warning(f"Unicode error during text reading, continuing: {e}")
text = filepath.read_bytes().decode("utf-8", errors="replace") text = filepath.read_bytes().decode("utf-8", errors="replace")
return text return text
@ -354,5 +354,5 @@ class DocumentParser(LoggingMixin):
return self.date return self.date
def cleanup(self): def cleanup(self):
self.log("debug", f"Deleting directory {self.tempdir}") self.log.debug(f"Deleting directory {self.tempdir}")
shutil.rmtree(self.tempdir) shutil.rmtree(self.tempdir)

View File

@ -832,8 +832,7 @@ class PreConsumeTestCase(TestCase):
} }
self.assertDictEqual(environment, {**environment, **subset}) self.assertDictEqual(environment, {**environment, **subset})
@mock.patch("documents.consumer.Consumer.log") def test_script_with_output(self):
def test_script_with_output(self, mocked_log):
""" """
GIVEN: GIVEN:
- A script which outputs to stdout and stderr - A script which outputs to stdout and stderr
@ -854,14 +853,19 @@ class PreConsumeTestCase(TestCase):
os.chmod(script.name, st.st_mode | stat.S_IEXEC) os.chmod(script.name, st.st_mode | stat.S_IEXEC)
with override_settings(PRE_CONSUME_SCRIPT=script.name): with override_settings(PRE_CONSUME_SCRIPT=script.name):
with self.assertLogs("paperless.consumer", level="INFO") as cm:
c = Consumer() c = Consumer()
c.path = "path-to-file" c.path = "path-to-file"
c.run_pre_consume_script() c.run_pre_consume_script()
self.assertIn(
mocked_log.assert_called() "INFO:paperless.consumer:This message goes to stdout",
cm.output,
mocked_log.assert_any_call("info", "This message goes to stdout") )
mocked_log.assert_any_call("warning", "This message goes to stderr") self.assertIn(
"WARNING:paperless.consumer:This message goes to stderr",
cm.output,
)
def test_script_exit_non_zero(self): def test_script_exit_non_zero(self):
""" """

View File

@ -422,7 +422,7 @@ class MailAccountHandler(LoggingMixin):
try: try:
return Correspondent.objects.get_or_create(name=name)[0] return Correspondent.objects.get_or_create(name=name)[0]
except DatabaseError as e: except DatabaseError as e:
self.log("error", f"Error while retrieving correspondent {name}: {e}") self.log.error(f"Error while retrieving correspondent {name}: {e}")
return None return None
def _get_title(self, message, att, rule): def _get_title(self, message, att, rule):
@ -468,7 +468,7 @@ class MailAccountHandler(LoggingMixin):
self.renew_logging_group() self.renew_logging_group()
self.log("debug", f"Processing mail account {account}") self.log.debug(f"Processing mail account {account}")
total_processed_files = 0 total_processed_files = 0
try: try:
@ -480,13 +480,12 @@ class MailAccountHandler(LoggingMixin):
supports_gmail_labels = "X-GM-EXT-1" in M.client.capabilities supports_gmail_labels = "X-GM-EXT-1" in M.client.capabilities
supports_auth_plain = "AUTH=PLAIN" in M.client.capabilities supports_auth_plain = "AUTH=PLAIN" in M.client.capabilities
self.log("debug", f"GMAIL Label Support: {supports_gmail_labels}") self.log.debug(f"GMAIL Label Support: {supports_gmail_labels}")
self.log("debug", f"AUTH=PLAIN Support: {supports_auth_plain}") self.log.debug(f"AUTH=PLAIN Support: {supports_auth_plain}")
mailbox_login(M, account) mailbox_login(M, account)
self.log( self.log.debug(
"debug",
f"Account {account}: Processing " f"Account {account}: Processing "
f"{account.rules.count()} rule(s)", f"{account.rules.count()} rule(s)",
) )
@ -499,16 +498,13 @@ class MailAccountHandler(LoggingMixin):
supports_gmail_labels, supports_gmail_labels,
) )
except Exception as e: except Exception as e:
self.log( self.log.exception(
"error",
f"Rule {rule}: Error while processing rule: {e}", f"Rule {rule}: Error while processing rule: {e}",
exc_info=True,
) )
except MailError: except MailError:
raise raise
except Exception as e: except Exception as e:
self.log( self.log.error(
"error",
f"Error while retrieving mailbox {account}: {e}", f"Error while retrieving mailbox {account}: {e}",
exc_info=False, exc_info=False,
) )
@ -521,21 +517,19 @@ class MailAccountHandler(LoggingMixin):
rule: MailRule, rule: MailRule,
supports_gmail_labels: bool, supports_gmail_labels: bool,
): ):
self.log("debug", f"Rule {rule}: Selecting folder {rule.folder}") self.log.debug(f"Rule {rule}: Selecting folder {rule.folder}")
try: try:
M.folder.set(rule.folder) M.folder.set(rule.folder)
except MailboxFolderSelectError as err: except MailboxFolderSelectError as err:
self.log( self.log.error(
"error",
f"Unable to access folder {rule.folder}, attempting folder listing", f"Unable to access folder {rule.folder}, attempting folder listing",
) )
try: try:
for folder_info in M.folder.list(): for folder_info in M.folder.list():
self.log("info", f"Located folder: {folder_info.name}") self.log.info(f"Located folder: {folder_info.name}")
except Exception as e: except Exception as e:
self.log( self.log.error(
"error",
"Exception during folder listing, unable to provide list folders: " "Exception during folder listing, unable to provide list folders: "
+ str(e), + str(e),
) )
@ -547,8 +541,7 @@ class MailAccountHandler(LoggingMixin):
criterias = make_criterias(rule, supports_gmail_labels) criterias = make_criterias(rule, supports_gmail_labels)
self.log( self.log.debug(
"debug",
f"Rule {rule}: Searching folder with criteria {str(criterias)}", f"Rule {rule}: Searching folder with criteria {str(criterias)}",
) )
@ -572,7 +565,7 @@ class MailAccountHandler(LoggingMixin):
uid=message.uid, uid=message.uid,
folder=rule.folder, folder=rule.folder,
).exists(): ).exists():
self.log("debug", f"Skipping mail {message}, already processed.") self.log.debug(f"Skipping mail {message}, already processed.")
continue continue
try: try:
@ -581,13 +574,11 @@ class MailAccountHandler(LoggingMixin):
total_processed_files += processed_files total_processed_files += processed_files
mails_processed += 1 mails_processed += 1
except Exception as e: except Exception as e:
self.log( self.log.exception(
"error",
f"Rule {rule}: Error while processing mail {message.uid}: {e}", f"Rule {rule}: Error while processing mail {message.uid}: {e}",
exc_info=True,
) )
self.log("debug", f"Rule {rule}: Processed {mails_processed} matching mail(s)") self.log.debug(f"Rule {rule}: Processed {mails_processed} matching mail(s)")
return total_processed_files return total_processed_files
@ -602,8 +593,7 @@ class MailAccountHandler(LoggingMixin):
): ):
return processed_elements return processed_elements
self.log( self.log.debug(
"debug",
f"Rule {rule}: " f"Rule {rule}: "
f"Processing mail {message.subject} from {message.from_} with " f"Processing mail {message.subject} from {message.from_} with "
f"{len(message.attachments)} attachment(s)", f"{len(message.attachments)} attachment(s)",
@ -657,8 +647,7 @@ class MailAccountHandler(LoggingMixin):
and rule.attachment_type and rule.attachment_type
== MailRule.AttachmentProcessing.ATTACHMENTS_ONLY == MailRule.AttachmentProcessing.ATTACHMENTS_ONLY
): ):
self.log( self.log.debug(
"debug",
f"Rule {rule}: " f"Rule {rule}: "
f"Skipping attachment {att.filename} " f"Skipping attachment {att.filename} "
f"with content disposition {att.content_disposition}", f"with content disposition {att.content_disposition}",
@ -688,8 +677,7 @@ class MailAccountHandler(LoggingMixin):
with open(temp_filename, "wb") as f: with open(temp_filename, "wb") as f:
f.write(att.payload) f.write(att.payload)
self.log( self.log.info(
"info",
f"Rule {rule}: " f"Rule {rule}: "
f"Consuming attachment {att.filename} from mail " f"Consuming attachment {att.filename} from mail "
f"{message.subject} from {message.from_}", f"{message.subject} from {message.from_}",
@ -717,8 +705,7 @@ class MailAccountHandler(LoggingMixin):
processed_attachments += 1 processed_attachments += 1
else: else:
self.log( self.log.debug(
"debug",
f"Rule {rule}: " f"Rule {rule}: "
f"Skipping attachment {att.filename} " f"Skipping attachment {att.filename} "
f"since guessed mime type {mime_type} is not supported " f"since guessed mime type {mime_type} is not supported "
@ -784,8 +771,7 @@ class MailAccountHandler(LoggingMixin):
f.write(message.obj.as_bytes()) f.write(message.obj.as_bytes())
self.log( self.log.info(
"info",
f"Rule {rule}: " f"Rule {rule}: "
f"Consuming eml from mail " f"Consuming eml from mail "
f"{message.subject} from {message.from_}", f"{message.subject} from {message.from_}",

View File

@ -64,8 +64,7 @@ class MailDocumentParser(DocumentParser):
try: try:
mail = self.get_parsed(document_path) mail = self.get_parsed(document_path)
except ParseError as e: except ParseError as e:
self.log( self.log.warning(
"warning",
f"Error while fetching document metadata for {document_path}: {e}", f"Error while fetching document metadata for {document_path}: {e}",
) )
return result return result
@ -146,7 +145,7 @@ class MailDocumentParser(DocumentParser):
self.archive_path = self.generate_pdf(document_path) self.archive_path = self.generate_pdf(document_path)
def tika_parse(self, html: str): def tika_parse(self, html: str):
self.log("info", "Sending content to Tika server") self.log.info("Sending content to Tika server")
try: try:
parsed = parser.from_buffer(html, self.tika_server) parsed = parser.from_buffer(html, self.tika_server)
@ -251,7 +250,7 @@ class MailDocumentParser(DocumentParser):
def generate_pdf_from_mail(self, mail): def generate_pdf_from_mail(self, mail):
url = self.gotenberg_server + "/forms/chromium/convert/html" url = self.gotenberg_server + "/forms/chromium/convert/html"
self.log("info", "Converting mail to PDF") self.log.info("Converting mail to PDF")
css_file = os.path.join(os.path.dirname(__file__), "templates/output.css") css_file = os.path.join(os.path.dirname(__file__), "templates/output.css")
@ -320,7 +319,7 @@ class MailDocumentParser(DocumentParser):
def generate_pdf_from_html(self, orig_html, attachments): def generate_pdf_from_html(self, orig_html, attachments):
url = self.gotenberg_server + "/forms/chromium/convert/html" url = self.gotenberg_server + "/forms/chromium/convert/html"
self.log("info", "Converting html to PDF") self.log.info("Converting html to PDF")
files = {} files = {}
for name, file in self.transform_inline_html(orig_html, attachments): for name, file in self.transform_inline_html(orig_html, attachments):

View File

@ -129,8 +129,7 @@ class TestParser(FileSystemAssertsMixin, TestCase):
) )
self.assertEqual(mocked_return, thumb) self.assertEqual(mocked_return, thumb)
@mock.patch("documents.loggers.LoggingMixin.log") def test_extract_metadata_fail(self):
def test_extract_metadata_fail(self, m: mock.MagicMock):
""" """
GIVEN: GIVEN:
- Fresh start - Fresh start
@ -140,8 +139,12 @@ class TestParser(FileSystemAssertsMixin, TestCase):
- A log warning should be generated - A log warning should be generated
""" """
# Validate if warning is logged when parsing fails # Validate if warning is logged when parsing fails
with self.assertLogs("paperless.parsing.mail", level="WARNING") as cm:
self.assertEqual([], self.parser.extract_metadata("na", "message/rfc822")) self.assertEqual([], self.parser.extract_metadata("na", "message/rfc822"))
self.assertEqual("warning", m.call_args[0][0]) self.assertIn(
"WARNING:paperless.parsing.mail:Error while fetching document metadata for na",
cm.output[0],
)
def test_extract_metadata(self): def test_extract_metadata(self):
""" """

View File

@ -54,8 +54,7 @@ class RasterisedDocumentParser(DocumentParser):
}, },
) )
except Exception as e: except Exception as e:
self.log( self.log.warning(
"warning",
f"Error while reading metadata {key}: {value}. Error: {e}", f"Error while reading metadata {key}: {value}. Error: {e}",
) )
return result return result
@ -98,7 +97,7 @@ class RasterisedDocumentParser(DocumentParser):
x, y = im.info["dpi"] x, y = im.info["dpi"]
return round(x) return round(x)
except Exception as e: except Exception as e:
self.log("warning", f"Error while getting DPI from image {image}: {e}") self.log.warning(f"Error while getting DPI from image {image}: {e}")
return None return None
def calculate_a4_dpi(self, image): def calculate_a4_dpi(self, image):
@ -107,11 +106,11 @@ class RasterisedDocumentParser(DocumentParser):
width, height = im.size width, height = im.size
# divide image width by A4 width (210mm) in inches. # divide image width by A4 width (210mm) in inches.
dpi = int(width / (21 / 2.54)) dpi = int(width / (21 / 2.54))
self.log("debug", f"Estimated DPI {dpi} based on image width {width}") self.log.debug(f"Estimated DPI {dpi} based on image width {width}")
return dpi return dpi
except Exception as e: except Exception as e:
self.log("warning", f"Error while calculating DPI for image {image}: {e}") self.log.warning(f"Error while calculating DPI for image {image}: {e}")
return None return None
def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path): def extract_text(self, sidecar_file: Optional[Path], pdf_file: Path):
@ -127,10 +126,10 @@ class RasterisedDocumentParser(DocumentParser):
if "[OCR skipped on page" not in text: if "[OCR skipped on page" not in text:
# This happens when there's already text in the input file. # This happens when there's already text in the input file.
# The sidecar file will only contain text for OCR'ed pages. # The sidecar file will only contain text for OCR'ed pages.
self.log("debug", "Using text from sidecar file") self.log.debug("Using text from sidecar file")
return post_process_text(text) return post_process_text(text)
else: else:
self.log("debug", "Incomplete sidecar file: discarding.") self.log.debug("Incomplete sidecar file: discarding.")
# no success with the sidecar file, try PDF # no success with the sidecar file, try PDF
@ -160,8 +159,7 @@ class RasterisedDocumentParser(DocumentParser):
except Exception: except Exception:
# If pdftotext fails, fall back to OCR. # If pdftotext fails, fall back to OCR.
self.log( self.log.warning(
"warning",
"Error while getting text from PDF document with pdftotext", "Error while getting text from PDF document with pdftotext",
exc_info=True, exc_info=True,
) )
@ -227,15 +225,14 @@ class RasterisedDocumentParser(DocumentParser):
a4_dpi = self.calculate_a4_dpi(input_file) a4_dpi = self.calculate_a4_dpi(input_file)
if self.has_alpha(input_file): if self.has_alpha(input_file):
self.log( self.log.info(
"info",
f"Removing alpha layer from {input_file} " f"Removing alpha layer from {input_file} "
"for compatibility with img2pdf", "for compatibility with img2pdf",
) )
self.remove_alpha(input_file) self.remove_alpha(input_file)
if dpi: if dpi:
self.log("debug", f"Detected DPI for image {input_file}: {dpi}") self.log.debug(f"Detected DPI for image {input_file}: {dpi}")
ocrmypdf_args["image_dpi"] = dpi ocrmypdf_args["image_dpi"] = dpi
elif settings.OCR_IMAGE_DPI: elif settings.OCR_IMAGE_DPI:
ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI ocrmypdf_args["image_dpi"] = settings.OCR_IMAGE_DPI
@ -253,8 +250,7 @@ class RasterisedDocumentParser(DocumentParser):
user_args = json.loads(settings.OCR_USER_ARGS) user_args = json.loads(settings.OCR_USER_ARGS)
ocrmypdf_args = {**ocrmypdf_args, **user_args} ocrmypdf_args = {**ocrmypdf_args, **user_args}
except Exception as e: except Exception as e:
self.log( self.log.warning(
"warning",
f"There is an issue with PAPERLESS_OCR_USER_ARGS, so " f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
f"they will not be used. Error: {e}", f"they will not be used. Error: {e}",
) )
@ -263,15 +259,13 @@ class RasterisedDocumentParser(DocumentParser):
# Convert pixels to mega-pixels and provide to ocrmypdf # Convert pixels to mega-pixels and provide to ocrmypdf
max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0 max_pixels_mpixels = settings.OCR_MAX_IMAGE_PIXELS / 1_000_000.0
if max_pixels_mpixels > 0: if max_pixels_mpixels > 0:
self.log( self.log.debug(
"debug",
f"Calculated {max_pixels_mpixels} megapixels for OCR", f"Calculated {max_pixels_mpixels} megapixels for OCR",
) )
ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels ocrmypdf_args["max_image_mpixels"] = max_pixels_mpixels
else: else:
self.log( self.log.warning(
"warning",
"There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, " "There is an issue with PAPERLESS_OCR_MAX_IMAGE_PIXELS, "
"this value must be at least 1 megapixel if set", "this value must be at least 1 megapixel if set",
) )
@ -299,7 +293,7 @@ class RasterisedDocumentParser(DocumentParser):
or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"] or settings.OCR_SKIP_ARCHIVE_FILE in ["with_text", "always"]
) )
if skip_archive_for_text and original_has_text: if skip_archive_for_text and original_has_text:
self.log("debug", "Document has text, skipping OCRmyPDF entirely.") self.log.debug("Document has text, skipping OCRmyPDF entirely.")
self.text = text_original self.text = text_original
return return
@ -322,7 +316,7 @@ class RasterisedDocumentParser(DocumentParser):
) )
try: try:
self.log("debug", f"Calling OCRmyPDF with args: {args}") self.log.debug(f"Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args) ocrmypdf.ocr(**args)
if settings.OCR_SKIP_ARCHIVE_FILE != "always": if settings.OCR_SKIP_ARCHIVE_FILE != "always":
@ -333,16 +327,14 @@ class RasterisedDocumentParser(DocumentParser):
if not self.text: if not self.text:
raise NoTextFoundException("No text was found in the original document") raise NoTextFoundException("No text was found in the original document")
except EncryptedPdfError: except EncryptedPdfError:
self.log( self.log.warning(
"warning",
"This file is encrypted, OCR is impossible. Using " "This file is encrypted, OCR is impossible. Using "
"any text present in the original file.", "any text present in the original file.",
) )
if original_has_text: if original_has_text:
self.text = text_original self.text = text_original
except (NoTextFoundException, InputFileError) as e: except (NoTextFoundException, InputFileError) as e:
self.log( self.log.warning(
"warning",
f"Encountered an error while running OCR: {str(e)}. " f"Encountered an error while running OCR: {str(e)}. "
f"Attempting force OCR to get the text.", f"Attempting force OCR to get the text.",
) )
@ -365,7 +357,7 @@ class RasterisedDocumentParser(DocumentParser):
) )
try: try:
self.log("debug", f"Fallback: Calling OCRmyPDF with args: {args}") self.log.debug(f"Fallback: Calling OCRmyPDF with args: {args}")
ocrmypdf.ocr(**args) ocrmypdf.ocr(**args)
# Don't return the archived file here, since this file # Don't return the archived file here, since this file
@ -390,8 +382,7 @@ class RasterisedDocumentParser(DocumentParser):
if original_has_text: if original_has_text:
self.text = text_original self.text = text_original
else: else:
self.log( self.log.warning(
"warning",
f"No text was found in {document_path}, the content will " f"No text was found in {document_path}, the content will "
f"be empty.", f"be empty.",
) )

View File

@ -38,8 +38,7 @@ class TikaDocumentParser(DocumentParser):
try: try:
parsed = parser.from_file(document_path, tika_server) parsed = parser.from_file(document_path, tika_server)
except Exception as e: except Exception as e:
self.log( self.log.warning(
"warning",
f"Error while fetching document metadata for {document_path}: {e}", f"Error while fetching document metadata for {document_path}: {e}",
) )
return [] return []
@ -55,7 +54,7 @@ class TikaDocumentParser(DocumentParser):
] ]
def parse(self, document_path: Path, mime_type, file_name=None): def parse(self, document_path: Path, mime_type, file_name=None):
self.log("info", f"Sending {document_path} to Tika server") self.log.info(f"Sending {document_path} to Tika server")
tika_server = settings.TIKA_ENDPOINT tika_server = settings.TIKA_ENDPOINT
# tika does not support a PathLike, only strings # tika does not support a PathLike, only strings
@ -75,8 +74,7 @@ class TikaDocumentParser(DocumentParser):
try: try:
self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"]) self.date = dateutil.parser.isoparse(parsed["metadata"]["Creation-Date"])
except Exception as e: except Exception as e:
self.log( self.log.warning(
"warning",
f"Unable to extract date for document {document_path}: {e}", f"Unable to extract date for document {document_path}: {e}",
) )
@ -87,7 +85,7 @@ class TikaDocumentParser(DocumentParser):
gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT gotenberg_server = settings.TIKA_GOTENBERG_ENDPOINT
url = gotenberg_server + "/forms/libreoffice/convert" url = gotenberg_server + "/forms/libreoffice/convert"
self.log("info", f"Converting {document_path} to PDF as {pdf_path}") self.log.info(f"Converting {document_path} to PDF as {pdf_path}")
with open(document_path, "rb") as document_handle: with open(document_path, "rb") as document_handle:
files = { files = {
"files": ( "files": (