diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index 92fb90bb1..b8cf12980 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -69,6 +69,11 @@ class MailDocumentParser(DocumentParser): for key, value in mail.headers.items(): value = ", ".join(i for i in value) + try: + value.encode("utf-8") + except UnicodeEncodeError as e: # pragma: no cover + self.log.debug(f"Skipping header {key}: {e}") + continue result.append( { diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index b6baa3289..09086585e 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -55,11 +55,21 @@ class RasterisedDocumentParser(DocumentParser): value = str(value) try: m = namespace_pattern.match(key) + if m is None: # pragma: no cover + continue + namespace = m.group(1) + key_value = m.group(2) + try: + namespace.encode("utf-8") + key_value.encode("utf-8") + except UnicodeEncodeError as e: # pragma: no cover + self.log.debug(f"Skipping metadata key {key}: {e}") + continue result.append( { - "namespace": m.group(1), - "prefix": meta.REVERSE_NS[m.group(1)], - "key": m.group(2), + "namespace": namespace, + "prefix": meta.REVERSE_NS[namespace], + "key": key_value, "value": value, }, )