diff --git a/src/paperless_mail/migrations/0011_alter_mailrule_action.py b/src/paperless_mail/migrations/0011_alter_mailrule_action.py deleted file mode 100644 index 4dbff1386..000000000 --- a/src/paperless_mail/migrations/0011_alter_mailrule_action.py +++ /dev/null @@ -1,27 +0,0 @@ -# Generated by Django 4.0.4 on 2022-04-19 18:13 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("paperless_mail", "0010_mailrule_consumption_scope"), - ] - - operations = [ - migrations.AlterField( - model_name="mailrule", - name="action", - field=models.PositiveIntegerField( - choices=[ - (1, "Delete"), - (2, "Move to specified folder"), - (3, "Mark as read, don't process read mails"), - (4, "Flag the mail, don't process flagged mails"), - ], - default=3, - verbose_name="action", - ), - ), - ] diff --git a/src/paperless_mail/migrations/0010_mailrule_consumption_scope.py b/src/paperless_mail/migrations/0015_mailrule_consumption_scope.py similarity index 86% rename from src/paperless_mail/migrations/0010_mailrule_consumption_scope.py rename to src/paperless_mail/migrations/0015_mailrule_consumption_scope.py index 8569cd378..e9e799014 100644 --- a/src/paperless_mail/migrations/0010_mailrule_consumption_scope.py +++ b/src/paperless_mail/migrations/0015_mailrule_consumption_scope.py @@ -1,4 +1,4 @@ -# Generated by Django 4.0.4 on 2022-04-14 22:36 +# Generated by Django 4.0.4 on 2022-05-03 15:58 from django.db import migrations, models @@ -6,7 +6,7 @@ from django.db import migrations, models class Migration(migrations.Migration): dependencies = [ - ("paperless_mail", "0009_alter_mailrule_action_alter_mailrule_folder"), + ("paperless_mail", "0014_alter_mailrule_action"), ] operations = [ diff --git a/src/paperless_mail/migrations/0015_merge_20220429_2356.py b/src/paperless_mail/migrations/0015_merge_20220429_2356.py deleted file mode 100644 index e46ac87d4..000000000 --- a/src/paperless_mail/migrations/0015_merge_20220429_2356.py +++ /dev/null @@ -1,13 +0,0 @@ -# Generated by Django 4.0.4 on 2022-04-29 21:56 - -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ("paperless_mail", "0011_alter_mailrule_action"), - ("paperless_mail", "0014_alter_mailrule_action"), - ] - - operations = [] diff --git a/src/paperless_mail/parsers.py b/src/paperless_mail/parsers.py index c3ac323ae..8c04c25af 100644 --- a/src/paperless_mail/parsers.py +++ b/src/paperless_mail/parsers.py @@ -10,6 +10,7 @@ from documents.parsers import DocumentParser from documents.parsers import make_thumbnail_from_pdf from documents.parsers import ParseError from imap_tools import MailMessage +from tika import parser class MailDocumentParser(DocumentParser): @@ -117,6 +118,36 @@ class MailDocumentParser(DocumentParser): self.date = mail.date self.archive_path = self.generate_pdf(document_path) + def tika_parse(self, document_path): + + self.log("info", f"Sending {document_path} to Tika server") + tika_server = settings.PAPERLESS_TIKA_ENDPOINT + + try: + parsed = parser.from_file(document_path, tika_server) + except Exception as err: + raise ParseError( + f"Could not parse {document_path} with tika server at " + f"{tika_server}: {err}", + ) + + subject = parsed["metadata"].get("dc:subject", "") + content = parsed["content"].strip() + + if content.startswith(subject): + content = content[len(subject) :].strip() + + content = re.sub(" +", " ", content) + content = re.sub("\n+", "\n", content) + + text = ( + f"{content}\n\n" + f"From: {parsed['metadata'].get('Message-From', '')}\n" + f"To: {parsed['metadata'].get('Message-To', '')}\n" + f"CC: {parsed['metadata'].get('Message-CC', '')}" + ) + return text + def generate_pdf(self, document_path): def clean_html(text: str): if isinstance(text, list): diff --git a/src/paperless_tika/apps.py b/src/paperless_tika/apps.py index 791d234a0..5cab21427 100644 --- a/src/paperless_tika/apps.py +++ b/src/paperless_tika/apps.py @@ -1,7 +1,6 @@ from django.apps import AppConfig from django.conf import settings from paperless_tika.signals import tika_consumer_declaration -from paperless_tika.signals import tika_consumer_declaration_eml class PaperlessTikaConfig(AppConfig): @@ -12,5 +11,4 @@ class PaperlessTikaConfig(AppConfig): if settings.PAPERLESS_TIKA_ENABLED: document_consumer_declaration.connect(tika_consumer_declaration) - document_consumer_declaration.connect(tika_consumer_declaration_eml) AppConfig.ready(self) diff --git a/src/paperless_tika/mail_template/index.html b/src/paperless_tika/mail_template/index.html deleted file mode 100644 index 7a8740dd8..000000000 --- a/src/paperless_tika/mail_template/index.html +++ /dev/null @@ -1,42 +0,0 @@ - - - - - - - - - - -
- -
- -
{{ date }}
- -
{{ from_label }}
-
{{ from }}
- -
{{ subject_label }}
-
{{ subject }}
- -
{{ to_label }}
-
{{ to }}
- -
{{ cc_label }}
-
{{ cc }}
- -
{{ bcc_label }}
-
{{ bcc }}
-
- - -
- - -
{{ content }} -
- - - - diff --git a/src/paperless_tika/mail_template/output.css b/src/paperless_tika/mail_template/output.css deleted file mode 100644 index 8b05e953b..000000000 --- a/src/paperless_tika/mail_template/output.css +++ /dev/null @@ -1,860 +0,0 @@ -/* -! tailwindcss v3.0.24 | MIT License | https://tailwindcss.com -*/ - -/* -1. Prevent padding and border from affecting element width. (https://github.com/mozdevs/cssremedy/issues/4) -2. Allow adding a border to an element by just adding a border-width. (https://github.com/tailwindcss/tailwindcss/pull/116) -*/ - -*, -::before, -::after { - box-sizing: border-box; - /* 1 */ - border-width: 0; - /* 2 */ - border-style: solid; - /* 2 */ - border-color: #e5e7eb; - /* 2 */ -} - -::before, -::after { - --tw-content: ''; -} - -/* -1. Use a consistent sensible line-height in all browsers. -2. Prevent adjustments of font size after orientation changes in iOS. -3. Use a more readable tab size. -4. Use the user's configured `sans` font-family by default. -*/ - -html { - line-height: 1.5; - /* 1 */ - -webkit-text-size-adjust: 100%; - /* 2 */ - -moz-tab-size: 4; - /* 3 */ - -o-tab-size: 4; - tab-size: 4; - /* 3 */ - font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; - /* 4 */ -} - -/* -1. Remove the margin in all browsers. -2. Inherit line-height from `html` so users can set them as a class directly on the `html` element. -*/ - -body { - margin: 0; - /* 1 */ - line-height: inherit; - /* 2 */ -} - -/* -1. Add the correct height in Firefox. -2. Correct the inheritance of border color in Firefox. (https://bugzilla.mozilla.org/show_bug.cgi?id=190655) -3. Ensure horizontal rules are visible by default. -*/ - -hr { - height: 0; - /* 1 */ - color: inherit; - /* 2 */ - border-top-width: 1px; - /* 3 */ -} - -/* -Add the correct text decoration in Chrome, Edge, and Safari. -*/ - -abbr:where([title]) { - -webkit-text-decoration: underline dotted; - text-decoration: underline dotted; -} - -/* -Remove the default font size and weight for headings. -*/ - -h1, -h2, -h3, -h4, -h5, -h6 { - font-size: inherit; - font-weight: inherit; -} - -/* -Reset links to optimize for opt-in styling instead of opt-out. -*/ - -a { - color: inherit; - text-decoration: inherit; -} - -/* -Add the correct font weight in Edge and Safari. -*/ - -b, -strong { - font-weight: bolder; -} - -/* -1. Use the user's configured `mono` font family by default. -2. Correct the odd `em` font sizing in all browsers. -*/ - -code, -kbd, -samp, -pre { - font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; - /* 1 */ - font-size: 1em; - /* 2 */ -} - -/* -Add the correct font size in all browsers. -*/ - -small { - font-size: 80%; -} - -/* -Prevent `sub` and `sup` elements from affecting the line height in all browsers. -*/ - -sub, -sup { - font-size: 75%; - line-height: 0; - position: relative; - vertical-align: baseline; -} - -sub { - bottom: -0.25em; -} - -sup { - top: -0.5em; -} - -/* -1. Remove text indentation from table contents in Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=999088, https://bugs.webkit.org/show_bug.cgi?id=201297) -2. Correct table border color inheritance in all Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=935729, https://bugs.webkit.org/show_bug.cgi?id=195016) -3. Remove gaps between table borders by default. -*/ - -table { - text-indent: 0; - /* 1 */ - border-color: inherit; - /* 2 */ - border-collapse: collapse; - /* 3 */ -} - -/* -1. Change the font styles in all browsers. -2. Remove the margin in Firefox and Safari. -3. Remove default padding in all browsers. -*/ - -button, -input, -optgroup, -select, -textarea { - font-family: inherit; - /* 1 */ - font-size: 100%; - /* 1 */ - line-height: inherit; - /* 1 */ - color: inherit; - /* 1 */ - margin: 0; - /* 2 */ - padding: 0; - /* 3 */ -} - -/* -Remove the inheritance of text transform in Edge and Firefox. -*/ - -button, -select { - text-transform: none; -} - -/* -1. Correct the inability to style clickable types in iOS and Safari. -2. Remove default button styles. -*/ - -button, -[type='button'], -[type='reset'], -[type='submit'] { - -webkit-appearance: button; - /* 1 */ - background-color: transparent; - /* 2 */ - background-image: none; - /* 2 */ -} - -/* -Use the modern Firefox focus style for all focusable elements. -*/ - -:-moz-focusring { - outline: auto; -} - -/* -Remove the additional `:invalid` styles in Firefox. (https://github.com/mozilla/gecko-dev/blob/2f9eacd9d3d995c937b4251a5557d95d494c9be1/layout/style/res/forms.css#L728-L737) -*/ - -:-moz-ui-invalid { - box-shadow: none; -} - -/* -Add the correct vertical alignment in Chrome and Firefox. -*/ - -progress { - vertical-align: baseline; -} - -/* -Correct the cursor style of increment and decrement buttons in Safari. -*/ - -::-webkit-inner-spin-button, -::-webkit-outer-spin-button { - height: auto; -} - -/* -1. Correct the odd appearance in Chrome and Safari. -2. Correct the outline style in Safari. -*/ - -[type='search'] { - -webkit-appearance: textfield; - /* 1 */ - outline-offset: -2px; - /* 2 */ -} - -/* -Remove the inner padding in Chrome and Safari on macOS. -*/ - -::-webkit-search-decoration { - -webkit-appearance: none; -} - -/* -1. Correct the inability to style clickable types in iOS and Safari. -2. Change font properties to `inherit` in Safari. -*/ - -::-webkit-file-upload-button { - -webkit-appearance: button; - /* 1 */ - font: inherit; - /* 2 */ -} - -/* -Add the correct display in Chrome and Safari. -*/ - -summary { - display: list-item; -} - -/* -Removes the default spacing and border for appropriate elements. -*/ - -blockquote, -dl, -dd, -h1, -h2, -h3, -h4, -h5, -h6, -hr, -figure, -p, -pre { - margin: 0; -} - -fieldset { - margin: 0; - padding: 0; -} - -legend { - padding: 0; -} - -ol, -ul, -menu { - list-style: none; - margin: 0; - padding: 0; -} - -/* -Prevent resizing textareas horizontally by default. -*/ - -textarea { - resize: vertical; -} - -/* -1. Reset the default placeholder opacity in Firefox. (https://github.com/tailwindlabs/tailwindcss/issues/3300) -2. Set the default placeholder color to the user's configured gray 400 color. -*/ - -input::-moz-placeholder, textarea::-moz-placeholder { - opacity: 1; - /* 1 */ - color: #9ca3af; - /* 2 */ -} - -input:-ms-input-placeholder, textarea:-ms-input-placeholder { - opacity: 1; - /* 1 */ - color: #9ca3af; - /* 2 */ -} - -input::placeholder, -textarea::placeholder { - opacity: 1; - /* 1 */ - color: #9ca3af; - /* 2 */ -} - -/* -Set the default cursor for buttons. -*/ - -button, -[role="button"] { - cursor: pointer; -} - -/* -Make sure disabled buttons don't get the pointer cursor. -*/ - -:disabled { - cursor: default; -} - -/* -1. Make replaced elements `display: block` by default. (https://github.com/mozdevs/cssremedy/issues/14) -2. Add `vertical-align: middle` to align replaced elements more sensibly by default. (https://github.com/jensimmons/cssremedy/issues/14#issuecomment-634934210) - This can trigger a poorly considered lint error in some tools but is included by design. -*/ - -img, -svg, -video, -canvas, -audio, -iframe, -embed, -object { - display: block; - /* 1 */ - vertical-align: middle; - /* 2 */ -} - -/* -Constrain images and videos to the parent width and preserve their intrinsic aspect ratio. (https://github.com/mozdevs/cssremedy/issues/14) -*/ - -img, -video { - max-width: 100%; - height: auto; -} - -/* -Ensure the default browser behavior of the `hidden` attribute. -*/ - -[hidden] { - display: none; -} - -*, ::before, ::after { - --tw-translate-x: 0; - --tw-translate-y: 0; - --tw-rotate: 0; - --tw-skew-x: 0; - --tw-skew-y: 0; - --tw-scale-x: 1; - --tw-scale-y: 1; - --tw-pan-x: ; - --tw-pan-y: ; - --tw-pinch-zoom: ; - --tw-scroll-snap-strictness: proximity; - --tw-ordinal: ; - --tw-slashed-zero: ; - --tw-numeric-figure: ; - --tw-numeric-spacing: ; - --tw-numeric-fraction: ; - --tw-ring-inset: ; - --tw-ring-offset-width: 0px; - --tw-ring-offset-color: #fff; - --tw-ring-color: rgb(59 130 246 / 0.5); - --tw-ring-offset-shadow: 0 0 #0000; - --tw-ring-shadow: 0 0 #0000; - --tw-shadow: 0 0 #0000; - --tw-shadow-colored: 0 0 #0000; - --tw-blur: ; - --tw-brightness: ; - --tw-contrast: ; - --tw-grayscale: ; - --tw-hue-rotate: ; - --tw-invert: ; - --tw-saturate: ; - --tw-sepia: ; - --tw-drop-shadow: ; - --tw-backdrop-blur: ; - --tw-backdrop-brightness: ; - --tw-backdrop-contrast: ; - --tw-backdrop-grayscale: ; - --tw-backdrop-hue-rotate: ; - --tw-backdrop-invert: ; - --tw-backdrop-opacity: ; - --tw-backdrop-saturate: ; - --tw-backdrop-sepia: ; -} - -.container { - width: 100%; -} - -@media (min-width: 640px) { - .container { - max-width: 640px; - } -} - -@media (min-width: 768px) { - .container { - max-width: 768px; - } -} - -@media (min-width: 1024px) { - .container { - max-width: 1024px; - } -} - -@media (min-width: 1280px) { - .container { - max-width: 1280px; - } -} - -@media (min-width: 1536px) { - .container { - max-width: 1536px; - } -} - -.col-span-12 { - grid-column: span 12 / span 12; -} - -.col-span-10 { - grid-column: span 10 / span 10; -} - -.col-span-2 { - grid-column: span 2 / span 2; -} - -.col-span-1 { - grid-column: span 1 / span 1; -} - -.col-span-9 { - grid-column: span 9 / span 9; -} - -.col-span-8 { - grid-column: span 8 / span 8; -} - -.col-span-3 { - grid-column: span 3 / span 3; -} - -.col-start-3 { - grid-column-start: 3; -} - -.col-start-1 { - grid-column-start: 1; -} - -.col-start-2 { - grid-column-start: 2; -} - -.col-start-12 { - grid-column-start: 12; -} - -.col-start-11 { - grid-column-start: 11; -} - -.col-start-10 { - grid-column-start: 10; -} - -.row-start-1 { - grid-row-start: 1; -} - -.row-start-2 { - grid-row-start: 2; -} - -.row-start-3 { - grid-row-start: 3; -} - -.row-start-4 { - grid-row-start: 4; -} - -.row-start-5 { - grid-row-start: 5; -} - -.mt-5 { - margin-top: 1.25rem; -} - -.mb-5 { - margin-bottom: 1.25rem; -} - -.mt-8 { - margin-top: 2rem; -} - -.mb-8 { - margin-bottom: 2rem; -} - -.mt-16 { - margin-top: 4rem; -} - -.mb-16 { - margin-bottom: 4rem; -} - -.mt-12 { - margin-top: 3rem; -} - -.mb-12 { - margin-bottom: 3rem; -} - -.mt-1 { - margin-top: 0.25rem; -} - -.mt-11 { - margin-top: 2.75rem; -} - -.mb-11 { - margin-bottom: 2.75rem; -} - -.mt-3 { - margin-top: 0.75rem; -} - -.mt-10 { - margin-top: 2.5rem; -} - -.box-border { - box-sizing: border-box; -} - -.box-content { - box-sizing: content-box; -} - -.flex { - display: flex; -} - -.grid { - display: grid; -} - -.h-1 { - height: 0.25rem; -} - -.h-\[4px\] { - height: 4px; -} - -.h-\[8px\] { - height: 8px; -} - -.h-\[30px\] { - height: 30px; -} - -.h-\[2px\] { - height: 2px; -} - -.h-\[1px\] { - height: 1px; -} - -.w-screen { - width: 100vw; -} - -.w-full { - width: 100%; -} - -.max-w-lg { - max-width: 32rem; -} - -.max-w-3xl { - max-width: 48rem; -} - -.max-w-4xl { - max-width: 56rem; -} - -.max-w-7xl { - max-width: 80rem; -} - -.auto-cols-min { - grid-auto-columns: -webkit-min-content; - grid-auto-columns: min-content; -} - -.auto-cols-fr { - grid-auto-columns: minmax(); -} - -.auto-cols-max { - grid-auto-columns: -webkit-max-content; - grid-auto-columns: max-content; -} - -.grid-cols-5 { - grid-template-columns: repeat(5, minmax()); -} - -.grid-cols-7 { - grid-template-columns: repeat(7, minmax()); -} - -.grid-cols-12 { - grid-template-columns: repeat(12, minmax()); -} - -.grid-rows-4 { - grid-template-rows: repeat(4, minmax()); -} - -.grid-rows-5 { - grid-template-rows: repeat(5, minmax()); -} - -.flex-col { - flex-direction: column; -} - -.items-center { - align-items: center; -} - -.justify-center { - justify-content: center; -} - -.gap-3 { - gap: 0.75rem; -} - -.gap-2 { - gap: 0.5rem; -} - -.gap-y-3 { - row-gap: 0.75rem; -} - -.gap-x-3 { - -moz-column-gap: 0.75rem; - column-gap: 0.75rem; -} - -.gap-x-2 { - -moz-column-gap: 0.5rem; - column-gap: 0.5rem; -} - -.whitespace-pre-line { - white-space: pre-line; -} - -.border { - border-width: 1px; -} - -.border-t { - border-top-width: 1px; -} - -.border-t-2 { - border-top-width: 2px; -} - -.border-b-2 { - border-bottom-width: 2px; -} - -.border-t-4 { - border-top-width: 4px; -} - -.border-b-4 { - border-bottom-width: 4px; -} - -.border-b { - border-bottom-width: 1px; -} - -.border-solid { - border-style: solid; -} - -.border-black { - --tw-border-opacity: 1; - border-color: rgb(0 0 0 / var(--tw-border-opacity)); -} - -.bg-red-600 { - --tw-bg-opacity: 1; - background-color: rgb(220 38 38 / var(--tw-bg-opacity)); -} - -.bg-white { - --tw-bg-opacity: 1; - background-color: rgb(255 255 255 / var(--tw-bg-opacity)); -} - -.bg-slate-300 { - --tw-bg-opacity: 1; - background-color: rgb(203 213 225 / var(--tw-bg-opacity)); -} - -.bg-slate-200 { - --tw-bg-opacity: 1; - background-color: rgb(226 232 240 / var(--tw-bg-opacity)); -} - -.p-3 { - padding: 0.75rem; -} - -.p-4 { - padding: 1rem; -} - -.text-right { - text-align: right; -} - -.align-middle { - vertical-align: middle; -} - -.text-3xl { - font-size: 1.875rem; - line-height: 2.25rem; -} - -.font-bold { - font-weight: 700; -} - -.text-slate-400 { - --tw-text-opacity: 1; - color: rgb(148 163 184 / var(--tw-text-opacity)); -} - -.text-blue-600 { - --tw-text-opacity: 1; - color: rgb(37 99 235 / var(--tw-text-opacity)); -} - -.underline { - -webkit-text-decoration-line: underline; - text-decoration-line: underline; -} diff --git a/src/paperless_tika/parsers.py b/src/paperless_tika/parsers.py index bc6c7aef9..22218dfe7 100644 --- a/src/paperless_tika/parsers.py +++ b/src/paperless_tika/parsers.py @@ -1,6 +1,4 @@ import os -import re -from io import StringIO import dateutil.parser import requests @@ -99,213 +97,3 @@ class TikaDocumentParser(DocumentParser): file.close() return pdf_path - - -class TikaDocumentParserEml(DocumentParser): - """ - This parser sends documents to a local tika server - """ - - logging_name = "paperless.parsing.tikaeml" - _tika_parsed = None - - def get_tika_result(self, document_path): - if not self._tika_parsed: - self.log("info", f"Sending {document_path} to Tika server") - tika_server = settings.PAPERLESS_TIKA_ENDPOINT - - try: - self._tika_parsed = parser.from_file( - document_path, - tika_server, - ) - except Exception as err: - raise ParseError( - f"Could not parse {document_path} with tika server at " - f"{tika_server}: {err}", - ) - - return self._tika_parsed - - def get_thumbnail(self, document_path, mime_type, file_name=None): - if not self.archive_path: - self.archive_path = self.generate_pdf(document_path) - - return make_thumbnail_from_pdf( - self.archive_path, - self.tempdir, - self.logging_group, - ) - - def extract_metadata(self, document_path, mime_type): - result = [] - prefix_pattern = re.compile(r"(.*):(.*)") - - try: - parsed = self.get_tika_result(document_path) - except ParseError as e: - self.log( - "warning", - f"Error while fetching document metadata for " f"{document_path}: {e}", - ) - return result - - for key, value in parsed["metadata"].items(): - if isinstance(value, list): - value = ", ".join([str(e) for e in value]) - value = str(value) - try: - m = prefix_pattern.match(key) - result.append( - { - "namespace": "", - "prefix": m.group(1), - "key": m.group(2), - "value": value, - }, - ) - except AttributeError: - result.append( - { - "namespace": "", - "prefix": "", - "key": key, - "value": value, - }, - ) - except Exception as e: - self.log( - "warning", - f"Error while reading metadata {key}: {value}. Error: " f"{e}", - ) - result.sort(key=lambda item: (item["prefix"], item["key"])) - return result - - def parse(self, document_path, mime_type, file_name=None): - parsed = self.get_tika_result(document_path) - - subject = parsed["metadata"].get("dc:subject", "") - content = parsed["content"].strip() - - if content.startswith(subject): - content = content[len(subject) :].strip() - - content = re.sub(" +", " ", content) - content = re.sub("\n+", "\n", content) - - self.text = ( - f"{content}\n\n" - f"From: {parsed['metadata'].get('Message-From', '')}\n" - f"To: {parsed['metadata'].get('Message-To', '')}\n" - f"CC: {parsed['metadata'].get('Message-CC', '')}" - ) - - try: - self.date = dateutil.parser.isoparse( - parsed["metadata"]["dcterms:created"], - ) - except Exception as e: - self.log( - "warning", - f"Unable to extract date for document " f"{document_path}: {e}", - ) - - self.archive_path = self.generate_pdf(document_path) - - def generate_pdf(self, document_path): - def clean_html(text: str): - if isinstance(text, list): - text = "\n".join([str(e) for e in text]) - if type(text) != str: - text = str(text) - text = text.replace("&", "&") - text = text.replace("<", "<") - text = text.replace(">", ">") - text = text.replace(" ", " ") - text = text.replace("'", "'") - text = text.replace('"', """) - return text - - parsed = self.get_tika_result(document_path) - - pdf_path = os.path.join(self.tempdir, "convert.pdf") - gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT - url = gotenberg_server + "/forms/chromium/convert/html" - - self.log("info", f"Converting {document_path} to PDF as {pdf_path}") - - data = {} - data["subject"] = clean_html(parsed["metadata"].get("dc:subject", "")) - if data["subject"] != "": - data["subject_label"] = "Subject" - data["from"] = clean_html(parsed["metadata"].get("Message-From", "")) - if data["from"] != "": - data["from_label"] = "From" - data["to"] = clean_html(parsed["metadata"].get("Message-To", "")) - if data["to"] != "": - data["to_label"] = "To" - data["cc"] = clean_html(parsed["metadata"].get("Message-CC", "")) - if data["cc"] != "": - data["cc_label"] = "CC" - data["bcc"] = clean_html(parsed["metadata"].get("Message-BCC", "")) - if data["bcc"] != "": - data["bcc_label"] = "BCC" - data["date"] = clean_html(parsed["metadata"].get("dcterms:created", "")) - - content = parsed.get("content", "").strip() - if content.startswith(data["subject"]): - content = content[len(data["subject"]) :].strip() - data["content"] = clean_html(content) - - html_file = os.path.join(os.path.dirname(__file__), "mail_template/index.html") - css_file = os.path.join(os.path.dirname(__file__), "mail_template/output.css") - placeholder_pattern = re.compile(r"{{(.+)}}") - html = StringIO() - - with open(html_file, "r") as html_template_handle: - with open(css_file, "rb") as css_handle: - for line in html_template_handle.readlines(): - for placeholder in placeholder_pattern.findall(line): - line = re.sub( - "{{" + placeholder + "}}", - data.get(placeholder.strip(), ""), - line, - ) - html.write(line) - html.seek(0) - files = { - "html": ( - "index.html", - html, - ), - "css": ( - "output.css", - css_handle, - ), - } - headers = {} - data = { - "marginTop": "0", - "marginBottom": "0", - "marginLeft": "0", - "marginRight": "0", - "paperWidth": "8.27", - "paperHeight": "11.7", - "scale": "1.0", - } - try: - response = requests.post( - url, - files=files, - headers=headers, - data=data, - ) - response.raise_for_status() # ensure we notice bad responses - except Exception as err: - raise ParseError(f"Error while converting document to PDF: {err}") - - with open(pdf_path, "wb") as file: - file.write(response.content) - file.close() - - return pdf_path diff --git a/src/paperless_tika/signals.py b/src/paperless_tika/signals.py index a852cfdb2..39838f076 100644 --- a/src/paperless_tika/signals.py +++ b/src/paperless_tika/signals.py @@ -22,19 +22,3 @@ def tika_consumer_declaration(sender, **kwargs): "text/rtf": ".rtf", }, } - - -def get_parser_eml(*args, **kwargs): - from .parsers import TikaDocumentParserEml - - return TikaDocumentParserEml(*args, **kwargs) - - -def tika_consumer_declaration_eml(sender, **kwargs): - return { - "parser": get_parser_eml, - "weight": 10, - "mime_types": { - "message/rfc822": ".eml", - }, - }