remove .eml parser from tika

This commit is contained in:
phail 2022-05-03 18:02:08 +02:00
parent 990e905a04
commit 5a899664f8
9 changed files with 33 additions and 1174 deletions

View File

@ -1,27 +0,0 @@
# Generated by Django 4.0.4 on 2022-04-19 18:13
from django.db import migrations, models
# Schema migration for the ``paperless_mail`` app: redefines the ``action``
# field of the ``mailrule`` model.
class Migration(migrations.Migration):
# Must be applied after migration 0010 of the same app.
dependencies = [
("paperless_mail", "0010_mailrule_consumption_scope"),
]
# Single operation: ``mailrule.action`` becomes a positive integer field
# restricted to the four choices listed below.
operations = [
migrations.AlterField(
model_name="mailrule",
name="action",
field=models.PositiveIntegerField(
# (stored value, human-readable label) pairs.
choices=[
(1, "Delete"),
(2, "Move to specified folder"),
(3, "Mark as read, don't process read mails"),
(4, "Flag the mail, don't process flagged mails"),
],
# 3 corresponds to the "Mark as read" choice above.
default=3,
verbose_name="action",
),
),
]

View File

@ -1,4 +1,4 @@
# Generated by Django 4.0.4 on 2022-04-14 22:36
# Generated by Django 4.0.4 on 2022-05-03 15:58
from django.db import migrations, models
@ -6,7 +6,7 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("paperless_mail", "0009_alter_mailrule_action_alter_mailrule_folder"),
("paperless_mail", "0014_alter_mailrule_action"),
]
operations = [

View File

@ -1,13 +0,0 @@
# Generated by Django 4.0.4 on 2022-04-29 21:56
from django.db import migrations
# Migration for the ``paperless_mail`` app that carries no operations.
class Migration(migrations.Migration):
# Depends on two migrations of the same app (0011 and 0014).
# NOTE(review): presumably a merge migration joining two branches of the
# migration graph - confirm against the migration history.
dependencies = [
("paperless_mail", "0011_alter_mailrule_action"),
("paperless_mail", "0014_alter_mailrule_action"),
]
# Empty: applying this migration runs no schema or data operations.
operations = []

View File

@ -10,6 +10,7 @@ from documents.parsers import DocumentParser
from documents.parsers import make_thumbnail_from_pdf
from documents.parsers import ParseError
from imap_tools import MailMessage
from tika import parser
class MailDocumentParser(DocumentParser):
@ -117,6 +118,36 @@ class MailDocumentParser(DocumentParser):
self.date = mail.date
self.archive_path = self.generate_pdf(document_path)
def tika_parse(self, document_path):
    """Extract the text of a mail file through the Tika server.

    The file is sent to the server configured in
    ``settings.PAPERLESS_TIKA_ENDPOINT``. The returned content has a leading
    copy of the subject removed and runs of spaces / newlines collapsed;
    the sender and recipient headers are appended at the end.

    Args:
        document_path: Path of the file handed to Tika.

    Returns:
        The cleaned-up content followed by ``From``, ``To`` and ``CC`` lines.

    Raises:
        ParseError: If the request to the Tika server fails for any reason.
    """
    self.log("info", f"Sending {document_path} to Tika server")
    tika_server = settings.PAPERLESS_TIKA_ENDPOINT

    try:
        parsed = parser.from_file(document_path, tika_server)
    except Exception as err:
        # Chain the cause so the original traceback is not lost.
        raise ParseError(
            f"Could not parse {document_path} with tika server at "
            f"{tika_server}: {err}",
        ) from err

    # Tika may answer without content (``None``) or without the keys at
    # all; treat both as "nothing extracted" instead of crashing.
    metadata = parsed.get("metadata") or {}
    content = (parsed.get("content") or "").strip()

    subject = metadata.get("dc:subject", "<no subject>")
    # Only a string subject can be a prefix of the content; a non-string
    # metadata value would make ``startswith`` raise ``TypeError``.
    if isinstance(subject, str) and content.startswith(subject):
        content = content[len(subject) :].strip()

    content = re.sub(r" +", " ", content)
    content = re.sub(r"\n+", "\n", content)

    return (
        f"{content}\n\n"
        f"From: {metadata.get('Message-From', '')}\n"
        f"To: {metadata.get('Message-To', '')}\n"
        f"CC: {metadata.get('Message-CC', '')}"
    )
def generate_pdf(self, document_path):
def clean_html(text: str):
if isinstance(text, list):

View File

@ -1,7 +1,6 @@
from django.apps import AppConfig
from django.conf import settings
from paperless_tika.signals import tika_consumer_declaration
from paperless_tika.signals import tika_consumer_declaration_eml
class PaperlessTikaConfig(AppConfig):
@ -12,5 +11,4 @@ class PaperlessTikaConfig(AppConfig):
if settings.PAPERLESS_TIKA_ENABLED:
document_consumer_declaration.connect(tika_consumer_declaration)
document_consumer_declaration.connect(tika_consumer_declaration_eml)
AppConfig.ready(self)

View File

@ -1,42 +0,0 @@
<!doctype html>
<html>
  <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <!--
      Mail rendering template. Double-curly placeholders are substituted
      line by line by the parser before the page is converted to PDF, so
      keep every placeholder on a line of its own.
    -->
    <link href="output.css" rel="stylesheet">
  </head>
  <body class="bg-white w-screen flex flex-col items-center">
    <div class="container max-w-4xl">
      <!-- Header: 5 rows x 12 columns; label in column 1, value from column 2 -->
      <div class="grid grid-rows-5 grid-cols-12 gap-x-2 bg-slate-200 p-4">
        <div class="col-start-11 col-span-2 row-start-1 text-right">{{ date }}</div>
        <div class="col-start-1 row-start-1 text-slate-400 text-right">{{ from_label }}</div>
        <div class="col-start-2 col-span-8 row-start-1">{{ from }}</div>
        <div class="col-start-1 row-start-2 text-slate-400 text-right">{{ subject_label }}</div>
        <div class="col-start-2 col-span-10 row-start-2 font-bold">{{ subject }}</div>
        <div class="col-start-1 row-start-3 text-slate-400 text-right">{{ to_label }}</div>
        <div class="col-start-2 col-span-10 row-start-3">{{ to }}</div>
        <div class="col-start-1 row-start-4 text-slate-400 text-right">{{ cc_label }}</div>
        <div class="col-start-2 col-span-10 row-start-4">{{ cc }}</div>
        <div class="col-start-1 row-start-5 text-slate-400 text-right">{{ bcc_label }}</div>
        <div class="col-start-2 col-span-10 row-start-5">{{ bcc }}</div>
      </div>
      <!-- Separator -->
      <div class="border-t border-solid border-b w-full h-[1px] box-content border-black mb-5 bg-slate-200"></div>
      <!-- Content: line breaks of the mail body are preserved (pre-line) -->
      <div class="w-full whitespace-pre-line">{{ content }}
      </div>
    </div>
  </body>
</html>

View File

@ -1,860 +0,0 @@
/*
! tailwindcss v3.0.24 | MIT License | https://tailwindcss.com
*/
/*
1. Prevent padding and border from affecting element width. (https://github.com/mozdevs/cssremedy/issues/4)
2. Allow adding a border to an element by just adding a border-width. (https://github.com/tailwindcss/tailwindcss/pull/116)
*/
*,
::before,
::after {
box-sizing: border-box;
/* 1 */
border-width: 0;
/* 2 */
border-style: solid;
/* 2 */
border-color: #e5e7eb;
/* 2 */
}
::before,
::after {
--tw-content: '';
}
/*
1. Use a consistent sensible line-height in all browsers.
2. Prevent adjustments of font size after orientation changes in iOS.
3. Use a more readable tab size.
4. Use the user's configured `sans` font-family by default.
*/
html {
line-height: 1.5;
/* 1 */
-webkit-text-size-adjust: 100%;
/* 2 */
-moz-tab-size: 4;
/* 3 */
-o-tab-size: 4;
tab-size: 4;
/* 3 */
font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
/* 4 */
}
/*
1. Remove the margin in all browsers.
2. Inherit line-height from `html` so users can set them as a class directly on the `html` element.
*/
body {
margin: 0;
/* 1 */
line-height: inherit;
/* 2 */
}
/*
1. Add the correct height in Firefox.
2. Correct the inheritance of border color in Firefox. (https://bugzilla.mozilla.org/show_bug.cgi?id=190655)
3. Ensure horizontal rules are visible by default.
*/
hr {
height: 0;
/* 1 */
color: inherit;
/* 2 */
border-top-width: 1px;
/* 3 */
}
/*
Add the correct text decoration in Chrome, Edge, and Safari.
*/
abbr:where([title]) {
-webkit-text-decoration: underline dotted;
text-decoration: underline dotted;
}
/*
Remove the default font size and weight for headings.
*/
h1,
h2,
h3,
h4,
h5,
h6 {
font-size: inherit;
font-weight: inherit;
}
/*
Reset links to optimize for opt-in styling instead of opt-out.
*/
a {
color: inherit;
text-decoration: inherit;
}
/*
Add the correct font weight in Edge and Safari.
*/
b,
strong {
font-weight: bolder;
}
/*
1. Use the user's configured `mono` font family by default.
2. Correct the odd `em` font sizing in all browsers.
*/
code,
kbd,
samp,
pre {
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
/* 1 */
font-size: 1em;
/* 2 */
}
/*
Add the correct font size in all browsers.
*/
small {
font-size: 80%;
}
/*
Prevent `sub` and `sup` elements from affecting the line height in all browsers.
*/
sub,
sup {
font-size: 75%;
line-height: 0;
position: relative;
vertical-align: baseline;
}
sub {
bottom: -0.25em;
}
sup {
top: -0.5em;
}
/*
1. Remove text indentation from table contents in Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=999088, https://bugs.webkit.org/show_bug.cgi?id=201297)
2. Correct table border color inheritance in all Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=935729, https://bugs.webkit.org/show_bug.cgi?id=195016)
3. Remove gaps between table borders by default.
*/
table {
text-indent: 0;
/* 1 */
border-color: inherit;
/* 2 */
border-collapse: collapse;
/* 3 */
}
/*
1. Change the font styles in all browsers.
2. Remove the margin in Firefox and Safari.
3. Remove default padding in all browsers.
*/
button,
input,
optgroup,
select,
textarea {
font-family: inherit;
/* 1 */
font-size: 100%;
/* 1 */
line-height: inherit;
/* 1 */
color: inherit;
/* 1 */
margin: 0;
/* 2 */
padding: 0;
/* 3 */
}
/*
Remove the inheritance of text transform in Edge and Firefox.
*/
button,
select {
text-transform: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Remove default button styles.
*/
button,
[type='button'],
[type='reset'],
[type='submit'] {
-webkit-appearance: button;
/* 1 */
background-color: transparent;
/* 2 */
background-image: none;
/* 2 */
}
/*
Use the modern Firefox focus style for all focusable elements.
*/
:-moz-focusring {
outline: auto;
}
/*
Remove the additional `:invalid` styles in Firefox. (https://github.com/mozilla/gecko-dev/blob/2f9eacd9d3d995c937b4251a5557d95d494c9be1/layout/style/res/forms.css#L728-L737)
*/
:-moz-ui-invalid {
box-shadow: none;
}
/*
Add the correct vertical alignment in Chrome and Firefox.
*/
progress {
vertical-align: baseline;
}
/*
Correct the cursor style of increment and decrement buttons in Safari.
*/
::-webkit-inner-spin-button,
::-webkit-outer-spin-button {
height: auto;
}
/*
1. Correct the odd appearance in Chrome and Safari.
2. Correct the outline style in Safari.
*/
[type='search'] {
-webkit-appearance: textfield;
/* 1 */
outline-offset: -2px;
/* 2 */
}
/*
Remove the inner padding in Chrome and Safari on macOS.
*/
::-webkit-search-decoration {
-webkit-appearance: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Change font properties to `inherit` in Safari.
*/
::-webkit-file-upload-button {
-webkit-appearance: button;
/* 1 */
font: inherit;
/* 2 */
}
/*
Add the correct display in Chrome and Safari.
*/
summary {
display: list-item;
}
/*
Removes the default spacing and border for appropriate elements.
*/
blockquote,
dl,
dd,
h1,
h2,
h3,
h4,
h5,
h6,
hr,
figure,
p,
pre {
margin: 0;
}
fieldset {
margin: 0;
padding: 0;
}
legend {
padding: 0;
}
ol,
ul,
menu {
list-style: none;
margin: 0;
padding: 0;
}
/*
Prevent resizing textareas horizontally by default.
*/
textarea {
resize: vertical;
}
/*
1. Reset the default placeholder opacity in Firefox. (https://github.com/tailwindlabs/tailwindcss/issues/3300)
2. Set the default placeholder color to the user's configured gray 400 color.
*/
input::-moz-placeholder, textarea::-moz-placeholder {
opacity: 1;
/* 1 */
color: #9ca3af;
/* 2 */
}
input:-ms-input-placeholder, textarea:-ms-input-placeholder {
opacity: 1;
/* 1 */
color: #9ca3af;
/* 2 */
}
input::placeholder,
textarea::placeholder {
opacity: 1;
/* 1 */
color: #9ca3af;
/* 2 */
}
/*
Set the default cursor for buttons.
*/
button,
[role="button"] {
cursor: pointer;
}
/*
Make sure disabled buttons don't get the pointer cursor.
*/
:disabled {
cursor: default;
}
/*
1. Make replaced elements `display: block` by default. (https://github.com/mozdevs/cssremedy/issues/14)
2. Add `vertical-align: middle` to align replaced elements more sensibly by default. (https://github.com/jensimmons/cssremedy/issues/14#issuecomment-634934210)
This can trigger a poorly considered lint error in some tools but is included by design.
*/
img,
svg,
video,
canvas,
audio,
iframe,
embed,
object {
display: block;
/* 1 */
vertical-align: middle;
/* 2 */
}
/*
Constrain images and videos to the parent width and preserve their intrinsic aspect ratio. (https://github.com/mozdevs/cssremedy/issues/14)
*/
img,
video {
max-width: 100%;
height: auto;
}
/*
Ensure the default browser behavior of the `hidden` attribute.
*/
[hidden] {
display: none;
}
*, ::before, ::after {
--tw-translate-x: 0;
--tw-translate-y: 0;
--tw-rotate: 0;
--tw-skew-x: 0;
--tw-skew-y: 0;
--tw-scale-x: 1;
--tw-scale-y: 1;
--tw-pan-x: ;
--tw-pan-y: ;
--tw-pinch-zoom: ;
--tw-scroll-snap-strictness: proximity;
--tw-ordinal: ;
--tw-slashed-zero: ;
--tw-numeric-figure: ;
--tw-numeric-spacing: ;
--tw-numeric-fraction: ;
--tw-ring-inset: ;
--tw-ring-offset-width: 0px;
--tw-ring-offset-color: #fff;
--tw-ring-color: rgb(59 130 246 / 0.5);
--tw-ring-offset-shadow: 0 0 #0000;
--tw-ring-shadow: 0 0 #0000;
--tw-shadow: 0 0 #0000;
--tw-shadow-colored: 0 0 #0000;
--tw-blur: ;
--tw-brightness: ;
--tw-contrast: ;
--tw-grayscale: ;
--tw-hue-rotate: ;
--tw-invert: ;
--tw-saturate: ;
--tw-sepia: ;
--tw-drop-shadow: ;
--tw-backdrop-blur: ;
--tw-backdrop-brightness: ;
--tw-backdrop-contrast: ;
--tw-backdrop-grayscale: ;
--tw-backdrop-hue-rotate: ;
--tw-backdrop-invert: ;
--tw-backdrop-opacity: ;
--tw-backdrop-saturate: ;
--tw-backdrop-sepia: ;
}
.container {
width: 100%;
}
@media (min-width: 640px) {
.container {
max-width: 640px;
}
}
@media (min-width: 768px) {
.container {
max-width: 768px;
}
}
@media (min-width: 1024px) {
.container {
max-width: 1024px;
}
}
@media (min-width: 1280px) {
.container {
max-width: 1280px;
}
}
@media (min-width: 1536px) {
.container {
max-width: 1536px;
}
}
.col-span-12 {
grid-column: span 12 / span 12;
}
.col-span-10 {
grid-column: span 10 / span 10;
}
.col-span-2 {
grid-column: span 2 / span 2;
}
.col-span-1 {
grid-column: span 1 / span 1;
}
.col-span-9 {
grid-column: span 9 / span 9;
}
.col-span-8 {
grid-column: span 8 / span 8;
}
.col-span-3 {
grid-column: span 3 / span 3;
}
.col-start-3 {
grid-column-start: 3;
}
.col-start-1 {
grid-column-start: 1;
}
.col-start-2 {
grid-column-start: 2;
}
.col-start-12 {
grid-column-start: 12;
}
.col-start-11 {
grid-column-start: 11;
}
.col-start-10 {
grid-column-start: 10;
}
.row-start-1 {
grid-row-start: 1;
}
.row-start-2 {
grid-row-start: 2;
}
.row-start-3 {
grid-row-start: 3;
}
.row-start-4 {
grid-row-start: 4;
}
.row-start-5 {
grid-row-start: 5;
}
.mt-5 {
margin-top: 1.25rem;
}
.mb-5 {
margin-bottom: 1.25rem;
}
.mt-8 {
margin-top: 2rem;
}
.mb-8 {
margin-bottom: 2rem;
}
.mt-16 {
margin-top: 4rem;
}
.mb-16 {
margin-bottom: 4rem;
}
.mt-12 {
margin-top: 3rem;
}
.mb-12 {
margin-bottom: 3rem;
}
.mt-1 {
margin-top: 0.25rem;
}
.mt-11 {
margin-top: 2.75rem;
}
.mb-11 {
margin-bottom: 2.75rem;
}
.mt-3 {
margin-top: 0.75rem;
}
.mt-10 {
margin-top: 2.5rem;
}
.box-border {
box-sizing: border-box;
}
.box-content {
box-sizing: content-box;
}
.flex {
display: flex;
}
.grid {
display: grid;
}
.h-1 {
height: 0.25rem;
}
.h-\[4px\] {
height: 4px;
}
.h-\[8px\] {
height: 8px;
}
.h-\[30px\] {
height: 30px;
}
.h-\[2px\] {
height: 2px;
}
.h-\[1px\] {
height: 1px;
}
.w-screen {
width: 100vw;
}
.w-full {
width: 100%;
}
.max-w-lg {
max-width: 32rem;
}
.max-w-3xl {
max-width: 48rem;
}
.max-w-4xl {
max-width: 56rem;
}
.max-w-7xl {
max-width: 80rem;
}
/*
Grid track sizing utilities.
The `fr`-based tracks use minmax(0, 1fr): a bare `minmax()` is not a valid
value and makes the browser drop the whole declaration.
*/
.auto-cols-min {
  grid-auto-columns: -webkit-min-content;
  grid-auto-columns: min-content;
}
.auto-cols-fr {
  grid-auto-columns: minmax(0, 1fr);
}
.auto-cols-max {
  grid-auto-columns: -webkit-max-content;
  grid-auto-columns: max-content;
}
.grid-cols-5 {
  grid-template-columns: repeat(5, minmax(0, 1fr));
}
.grid-cols-7 {
  grid-template-columns: repeat(7, minmax(0, 1fr));
}
.grid-cols-12 {
  grid-template-columns: repeat(12, minmax(0, 1fr));
}
.grid-rows-4 {
  grid-template-rows: repeat(4, minmax(0, 1fr));
}
.grid-rows-5 {
  grid-template-rows: repeat(5, minmax(0, 1fr));
}
.flex-col {
flex-direction: column;
}
.items-center {
align-items: center;
}
.justify-center {
justify-content: center;
}
.gap-3 {
gap: 0.75rem;
}
.gap-2 {
gap: 0.5rem;
}
.gap-y-3 {
row-gap: 0.75rem;
}
.gap-x-3 {
-moz-column-gap: 0.75rem;
column-gap: 0.75rem;
}
.gap-x-2 {
-moz-column-gap: 0.5rem;
column-gap: 0.5rem;
}
.whitespace-pre-line {
white-space: pre-line;
}
.border {
border-width: 1px;
}
.border-t {
border-top-width: 1px;
}
.border-t-2 {
border-top-width: 2px;
}
.border-b-2 {
border-bottom-width: 2px;
}
.border-t-4 {
border-top-width: 4px;
}
.border-b-4 {
border-bottom-width: 4px;
}
.border-b {
border-bottom-width: 1px;
}
.border-solid {
border-style: solid;
}
.border-black {
--tw-border-opacity: 1;
border-color: rgb(0 0 0 / var(--tw-border-opacity));
}
.bg-red-600 {
--tw-bg-opacity: 1;
background-color: rgb(220 38 38 / var(--tw-bg-opacity));
}
.bg-white {
--tw-bg-opacity: 1;
background-color: rgb(255 255 255 / var(--tw-bg-opacity));
}
.bg-slate-300 {
--tw-bg-opacity: 1;
background-color: rgb(203 213 225 / var(--tw-bg-opacity));
}
.bg-slate-200 {
--tw-bg-opacity: 1;
background-color: rgb(226 232 240 / var(--tw-bg-opacity));
}
.p-3 {
padding: 0.75rem;
}
.p-4 {
padding: 1rem;
}
.text-right {
text-align: right;
}
.align-middle {
vertical-align: middle;
}
.text-3xl {
font-size: 1.875rem;
line-height: 2.25rem;
}
.font-bold {
font-weight: 700;
}
.text-slate-400 {
--tw-text-opacity: 1;
color: rgb(148 163 184 / var(--tw-text-opacity));
}
.text-blue-600 {
--tw-text-opacity: 1;
color: rgb(37 99 235 / var(--tw-text-opacity));
}
.underline {
-webkit-text-decoration-line: underline;
text-decoration-line: underline;
}

View File

@ -1,6 +1,4 @@
import os
import re
from io import StringIO
import dateutil.parser
import requests
@ -99,213 +97,3 @@ class TikaDocumentParser(DocumentParser):
file.close()
return pdf_path
class TikaDocumentParserEml(DocumentParser):
    """
    Parser for .eml mail files that sends documents to a local tika server.

    Text and metadata come from the Tika server configured in
    ``settings.PAPERLESS_TIKA_ENDPOINT``. The archive PDF is produced by
    filling the bundled HTML mail template and posting it to the Gotenberg
    server at ``settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT``.
    """

    logging_name = "paperless.parsing.tikaeml"

    # Cached Tika response. Note that the cache is per parser instance and
    # is not keyed by path: the first parsed document wins.
    _tika_parsed = None

    def get_tika_result(self, document_path):
        """Return the Tika response for ``document_path``, querying Tika once.

        Raises:
            ParseError: If the request to the Tika server fails.
        """
        if not self._tika_parsed:
            self.log("info", f"Sending {document_path} to Tika server")
            tika_server = settings.PAPERLESS_TIKA_ENDPOINT

            try:
                self._tika_parsed = parser.from_file(
                    document_path,
                    tika_server,
                )
            except Exception as err:
                raise ParseError(
                    f"Could not parse {document_path} with tika server at "
                    f"{tika_server}: {err}",
                ) from err

        return self._tika_parsed

    def get_thumbnail(self, document_path, mime_type, file_name=None):
        """Return a thumbnail made from the archive PDF, generating it if needed."""
        if not self.archive_path:
            self.archive_path = self.generate_pdf(document_path)

        return make_thumbnail_from_pdf(
            self.archive_path,
            self.tempdir,
            self.logging_group,
        )

    def extract_metadata(self, document_path, mime_type):
        """Return the Tika metadata as a sorted list of dicts.

        Each entry has the keys ``namespace`` (always empty), ``prefix``,
        ``key`` and ``value``. A metadata name containing a colon is split at
        its last colon into prefix and key. If Tika cannot be reached, a
        warning is logged and an empty list is returned.
        """
        result = []
        prefix_pattern = re.compile(r"(.*):(.*)")

        try:
            parsed = self.get_tika_result(document_path)
        except ParseError as e:
            self.log(
                "warning",
                f"Error while fetching document metadata for {document_path}: {e}",
            )
            return result

        # Tika may answer without a metadata section.
        for key, value in (parsed.get("metadata") or {}).items():
            if isinstance(value, list):
                value = ", ".join([str(e) for e in value])
            value = str(value)

            try:
                match = prefix_pattern.match(key)
            except TypeError as e:
                # Non-string metadata name: report it and keep going.
                self.log(
                    "warning",
                    f"Error while reading metadata {key}: {value}. Error: {e}",
                )
                continue

            if match:
                prefix, name = match.group(1), match.group(2)
            else:
                prefix, name = "", key

            result.append(
                {
                    "namespace": "",
                    "prefix": prefix,
                    "key": name,
                    "value": value,
                },
            )

        result.sort(key=lambda item: (item["prefix"], item["key"]))
        return result

    def parse(self, document_path, mime_type, file_name=None):
        """Populate ``text``, ``date`` and ``archive_path`` from the mail file.

        The text is the Tika content (without a leading copy of the subject,
        with runs of spaces / newlines collapsed) followed by the From, To
        and CC headers. A missing or unparsable creation date is only logged.

        Raises:
            ParseError: If Tika or the PDF conversion fails.
        """
        parsed = self.get_tika_result(document_path)

        # Tika may answer without content (``None``) or without the keys.
        metadata = parsed.get("metadata") or {}
        content = (parsed.get("content") or "").strip()

        subject = metadata.get("dc:subject", "<no subject>")
        # A non-string subject cannot be a prefix of the content.
        if isinstance(subject, str) and content.startswith(subject):
            content = content[len(subject) :].strip()

        content = re.sub(r" +", " ", content)
        content = re.sub(r"\n+", "\n", content)

        self.text = (
            f"{content}\n\n"
            f"From: {metadata.get('Message-From', '')}\n"
            f"To: {metadata.get('Message-To', '')}\n"
            f"CC: {metadata.get('Message-CC', '')}"
        )

        try:
            self.date = dateutil.parser.isoparse(
                metadata["dcterms:created"],
            )
        except Exception as e:
            # Best effort: the date is optional, so only report the problem.
            self.log(
                "warning",
                f"Unable to extract date for document {document_path}: {e}",
            )

        self.archive_path = self.generate_pdf(document_path)

    def generate_pdf(self, document_path):
        """Render the mail as a PDF through Gotenberg and return its path.

        The HTML mail template is filled with the escaped mail headers and
        content and posted together with its stylesheet to Gotenberg's
        Chromium HTML conversion route. The result is written to
        ``convert.pdf`` inside ``self.tempdir``.

        Raises:
            ParseError: If Tika fails or the conversion request fails.
        """

        def clean_html(text: str):
            """Turn a metadata value into HTML-escaped text."""
            if isinstance(text, list):
                text = "\n".join([str(e) for e in text])
            if not isinstance(text, str):
                text = str(text)
            # "&" must be escaped first so later entities are not re-escaped.
            text = text.replace("&", "&amp;")
            text = text.replace("<", "&lt;")
            text = text.replace(">", "&gt;")
            text = text.replace(" ", "&nbsp;")
            text = text.replace("'", "&apos;")
            text = text.replace('"', "&quot;")
            return text

        parsed = self.get_tika_result(document_path)
        metadata = parsed.get("metadata") or {}

        pdf_path = os.path.join(self.tempdir, "convert.pdf")
        gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
        url = gotenberg_server + "/forms/chromium/convert/html"

        self.log("info", f"Converting {document_path} to PDF as {pdf_path}")

        # Values for the template placeholders. A label is only set when its
        # header is present, so empty headers render without a caption.
        template_values = {}
        for field, meta_key, label in (
            ("subject", "dc:subject", "Subject"),
            ("from", "Message-From", "From"),
            ("to", "Message-To", "To"),
            ("cc", "Message-CC", "CC"),
            ("bcc", "Message-BCC", "BCC"),
        ):
            template_values[field] = clean_html(metadata.get(meta_key, ""))
            if template_values[field] != "":
                template_values[f"{field}_label"] = label

        template_values["date"] = clean_html(metadata.get("dcterms:created", ""))

        # Compare against the RAW subject: the escaped one (spaces turned
        # into "&nbsp;" etc.) would almost never match the raw content.
        raw_subject = metadata.get("dc:subject", "")
        content = (parsed.get("content") or "").strip()
        if isinstance(raw_subject, str) and content.startswith(raw_subject):
            content = content[len(raw_subject) :].strip()
        template_values["content"] = clean_html(content)

        html_file = os.path.join(os.path.dirname(__file__), "mail_template/index.html")
        css_file = os.path.join(os.path.dirname(__file__), "mail_template/output.css")
        placeholder_pattern = re.compile(r"\{\{(.+?)\}\}")

        def fill_placeholder(match):
            # Returning the value from a function keeps backslashes in mail
            # text from being interpreted as regex replacement escapes.
            return template_values.get(match.group(1).strip(), "")

        html = StringIO()
        with open(html_file, "r", encoding="utf-8") as html_template_handle:
            for line in html_template_handle:
                html.write(placeholder_pattern.sub(fill_placeholder, line))
        html.seek(0)

        # The stylesheet handle must stay open until the request has been sent.
        with open(css_file, "rb") as css_handle:
            files = {
                "html": (
                    "index.html",
                    html,
                ),
                "css": (
                    "output.css",
                    css_handle,
                ),
            }
            headers = {}
            form_data = {
                "marginTop": "0",
                "marginBottom": "0",
                "marginLeft": "0",
                "marginRight": "0",
                "paperWidth": "8.27",
                "paperHeight": "11.7",
                "scale": "1.0",
            }
            try:
                response = requests.post(
                    url,
                    files=files,
                    headers=headers,
                    data=form_data,
                )
                response.raise_for_status()  # ensure we notice bad responses
            except Exception as err:
                raise ParseError(
                    f"Error while converting document to PDF: {err}",
                ) from err

        with open(pdf_path, "wb") as file:
            file.write(response.content)

        return pdf_path

View File

@ -22,19 +22,3 @@ def tika_consumer_declaration(sender, **kwargs):
"text/rtf": ".rtf",
},
}
def get_parser_eml(*args, **kwargs):
    """Create a ``TikaDocumentParserEml``, forwarding every argument as given.

    The parsers module is imported at call time rather than when this module
    is loaded.
    """
    from . import parsers as tika_parsers

    return tika_parsers.TikaDocumentParserEml(*args, **kwargs)
def tika_consumer_declaration_eml(sender, **kwargs):
    """Describe the .eml parser to whoever sent the declaration signal.

    Returns:
        A dict with the parser factory (``get_parser_eml``), its weight (10)
        and the handled MIME types mapped to their default file extension.
    """
    handled_mime_types = {"message/rfc822": ".eml"}
    declaration = dict(
        parser=get_parser_eml,
        weight=10,
        mime_types=handled_mime_types,
    )
    return declaration