mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-04-02 13:45:10 -05:00
remove .eml parser from tika
This commit is contained in:
parent
990e905a04
commit
5a899664f8
@ -1,27 +0,0 @@
|
||||
# Generated by Django 4.0.4 on 2022-04-19 18:13
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("paperless_mail", "0010_mailrule_consumption_scope"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AlterField(
|
||||
model_name="mailrule",
|
||||
name="action",
|
||||
field=models.PositiveIntegerField(
|
||||
choices=[
|
||||
(1, "Delete"),
|
||||
(2, "Move to specified folder"),
|
||||
(3, "Mark as read, don't process read mails"),
|
||||
(4, "Flag the mail, don't process flagged mails"),
|
||||
],
|
||||
default=3,
|
||||
verbose_name="action",
|
||||
),
|
||||
),
|
||||
]
|
@ -1,4 +1,4 @@
|
||||
# Generated by Django 4.0.4 on 2022-04-14 22:36
|
||||
# Generated by Django 4.0.4 on 2022-05-03 15:58
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
@ -6,7 +6,7 @@ from django.db import migrations, models
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("paperless_mail", "0009_alter_mailrule_action_alter_mailrule_folder"),
|
||||
("paperless_mail", "0014_alter_mailrule_action"),
|
||||
]
|
||||
|
||||
operations = [
|
@ -1,13 +0,0 @@
|
||||
# Generated by Django 4.0.4 on 2022-04-29 21:56
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("paperless_mail", "0011_alter_mailrule_action"),
|
||||
("paperless_mail", "0014_alter_mailrule_action"),
|
||||
]
|
||||
|
||||
operations = []
|
@ -10,6 +10,7 @@ from documents.parsers import DocumentParser
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.parsers import ParseError
|
||||
from imap_tools import MailMessage
|
||||
from tika import parser
|
||||
|
||||
|
||||
class MailDocumentParser(DocumentParser):
|
||||
@ -117,6 +118,36 @@ class MailDocumentParser(DocumentParser):
|
||||
self.date = mail.date
|
||||
self.archive_path = self.generate_pdf(document_path)
|
||||
|
||||
def tika_parse(self, document_path):
|
||||
|
||||
self.log("info", f"Sending {document_path} to Tika server")
|
||||
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
|
||||
|
||||
try:
|
||||
parsed = parser.from_file(document_path, tika_server)
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Could not parse {document_path} with tika server at "
|
||||
f"{tika_server}: {err}",
|
||||
)
|
||||
|
||||
subject = parsed["metadata"].get("dc:subject", "<no subject>")
|
||||
content = parsed["content"].strip()
|
||||
|
||||
if content.startswith(subject):
|
||||
content = content[len(subject) :].strip()
|
||||
|
||||
content = re.sub(" +", " ", content)
|
||||
content = re.sub("\n+", "\n", content)
|
||||
|
||||
text = (
|
||||
f"{content}\n\n"
|
||||
f"From: {parsed['metadata'].get('Message-From', '')}\n"
|
||||
f"To: {parsed['metadata'].get('Message-To', '')}\n"
|
||||
f"CC: {parsed['metadata'].get('Message-CC', '')}"
|
||||
)
|
||||
return text
|
||||
|
||||
def generate_pdf(self, document_path):
|
||||
def clean_html(text: str):
|
||||
if isinstance(text, list):
|
||||
|
@ -1,7 +1,6 @@
|
||||
from django.apps import AppConfig
|
||||
from django.conf import settings
|
||||
from paperless_tika.signals import tika_consumer_declaration
|
||||
from paperless_tika.signals import tika_consumer_declaration_eml
|
||||
|
||||
|
||||
class PaperlessTikaConfig(AppConfig):
|
||||
@ -12,5 +11,4 @@ class PaperlessTikaConfig(AppConfig):
|
||||
|
||||
if settings.PAPERLESS_TIKA_ENABLED:
|
||||
document_consumer_declaration.connect(tika_consumer_declaration)
|
||||
document_consumer_declaration.connect(tika_consumer_declaration_eml)
|
||||
AppConfig.ready(self)
|
||||
|
@ -1,42 +0,0 @@
|
||||
<!doctype html>
|
||||
<html>
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<link href="output.css" rel="stylesheet">
|
||||
</head>
|
||||
|
||||
<body class="bg-white w-screen flex flex-col items-center">
|
||||
<div class="container max-w-4xl">
|
||||
<!-- Header -->
|
||||
<div class="grid grid-rows-5 grid-cols-12 gap-x-2 bg-slate-200 p-4">
|
||||
|
||||
<div class="col-start-11 col-span-2 row-start-1 text-right">{{ date }}</div>
|
||||
|
||||
<div class="col-start-1 row-start-1 text-slate-400 text-right">{{ from_label }}</div>
|
||||
<div class="col-start-2 col-span-8 row-start 1">{{ from }}</div>
|
||||
|
||||
<div class="col-start-1 row-start-2 text-slate-400 text-right">{{ subject_label }}</div>
|
||||
<div class=" col-start-2 col-span-10 row-start 2 font-bold">{{ subject }}</div>
|
||||
|
||||
<div class="col-start-1 row-start-3 text-slate-400 text-right">{{ to_label }}</div>
|
||||
<div class="col-start-2 col-span-10 row-start 3">{{ to }}</div>
|
||||
|
||||
<div class="col-start-1 row-start-4 text-slate-400 text-right">{{ cc_label }}</div>
|
||||
<div class="col-start-2 col-span-10 row-start 4">{{ cc }}</div>
|
||||
|
||||
<div class="col-start-1 row-start-5 text-slate-400 text-right">{{ bcc_label }}</div>
|
||||
<div class="col-start-2 col-span-10 row-start 5">{{ bcc }}</div>
|
||||
</div>
|
||||
|
||||
<!-- Separator-->
|
||||
<div class="border-t border-solid border-b w-full h-[1px] box-content border-black mb-5 bg-slate-200"></div>
|
||||
|
||||
<!-- Content-->
|
||||
<div class="w-full whitespace-pre-line">{{ content }}
|
||||
</div>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
@ -1,860 +0,0 @@
|
||||
/*
|
||||
! tailwindcss v3.0.24 | MIT License | https://tailwindcss.com
|
||||
*/
|
||||
|
||||
/*
|
||||
1. Prevent padding and border from affecting element width. (https://github.com/mozdevs/cssremedy/issues/4)
|
||||
2. Allow adding a border to an element by just adding a border-width. (https://github.com/tailwindcss/tailwindcss/pull/116)
|
||||
*/
|
||||
|
||||
*,
|
||||
::before,
|
||||
::after {
|
||||
box-sizing: border-box;
|
||||
/* 1 */
|
||||
border-width: 0;
|
||||
/* 2 */
|
||||
border-style: solid;
|
||||
/* 2 */
|
||||
border-color: #e5e7eb;
|
||||
/* 2 */
|
||||
}
|
||||
|
||||
::before,
|
||||
::after {
|
||||
--tw-content: '';
|
||||
}
|
||||
|
||||
/*
|
||||
1. Use a consistent sensible line-height in all browsers.
|
||||
2. Prevent adjustments of font size after orientation changes in iOS.
|
||||
3. Use a more readable tab size.
|
||||
4. Use the user's configured `sans` font-family by default.
|
||||
*/
|
||||
|
||||
html {
|
||||
line-height: 1.5;
|
||||
/* 1 */
|
||||
-webkit-text-size-adjust: 100%;
|
||||
/* 2 */
|
||||
-moz-tab-size: 4;
|
||||
/* 3 */
|
||||
-o-tab-size: 4;
|
||||
tab-size: 4;
|
||||
/* 3 */
|
||||
font-family: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
|
||||
/* 4 */
|
||||
}
|
||||
|
||||
/*
|
||||
1. Remove the margin in all browsers.
|
||||
2. Inherit line-height from `html` so users can set them as a class directly on the `html` element.
|
||||
*/
|
||||
|
||||
body {
|
||||
margin: 0;
|
||||
/* 1 */
|
||||
line-height: inherit;
|
||||
/* 2 */
|
||||
}
|
||||
|
||||
/*
|
||||
1. Add the correct height in Firefox.
|
||||
2. Correct the inheritance of border color in Firefox. (https://bugzilla.mozilla.org/show_bug.cgi?id=190655)
|
||||
3. Ensure horizontal rules are visible by default.
|
||||
*/
|
||||
|
||||
hr {
|
||||
height: 0;
|
||||
/* 1 */
|
||||
color: inherit;
|
||||
/* 2 */
|
||||
border-top-width: 1px;
|
||||
/* 3 */
|
||||
}
|
||||
|
||||
/*
|
||||
Add the correct text decoration in Chrome, Edge, and Safari.
|
||||
*/
|
||||
|
||||
abbr:where([title]) {
|
||||
-webkit-text-decoration: underline dotted;
|
||||
text-decoration: underline dotted;
|
||||
}
|
||||
|
||||
/*
|
||||
Remove the default font size and weight for headings.
|
||||
*/
|
||||
|
||||
h1,
|
||||
h2,
|
||||
h3,
|
||||
h4,
|
||||
h5,
|
||||
h6 {
|
||||
font-size: inherit;
|
||||
font-weight: inherit;
|
||||
}
|
||||
|
||||
/*
|
||||
Reset links to optimize for opt-in styling instead of opt-out.
|
||||
*/
|
||||
|
||||
a {
|
||||
color: inherit;
|
||||
text-decoration: inherit;
|
||||
}
|
||||
|
||||
/*
|
||||
Add the correct font weight in Edge and Safari.
|
||||
*/
|
||||
|
||||
b,
|
||||
strong {
|
||||
font-weight: bolder;
|
||||
}
|
||||
|
||||
/*
|
||||
1. Use the user's configured `mono` font family by default.
|
||||
2. Correct the odd `em` font sizing in all browsers.
|
||||
*/
|
||||
|
||||
code,
|
||||
kbd,
|
||||
samp,
|
||||
pre {
|
||||
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
|
||||
/* 1 */
|
||||
font-size: 1em;
|
||||
/* 2 */
|
||||
}
|
||||
|
||||
/*
|
||||
Add the correct font size in all browsers.
|
||||
*/
|
||||
|
||||
small {
|
||||
font-size: 80%;
|
||||
}
|
||||
|
||||
/*
|
||||
Prevent `sub` and `sup` elements from affecting the line height in all browsers.
|
||||
*/
|
||||
|
||||
sub,
|
||||
sup {
|
||||
font-size: 75%;
|
||||
line-height: 0;
|
||||
position: relative;
|
||||
vertical-align: baseline;
|
||||
}
|
||||
|
||||
sub {
|
||||
bottom: -0.25em;
|
||||
}
|
||||
|
||||
sup {
|
||||
top: -0.5em;
|
||||
}
|
||||
|
||||
/*
|
||||
1. Remove text indentation from table contents in Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=999088, https://bugs.webkit.org/show_bug.cgi?id=201297)
|
||||
2. Correct table border color inheritance in all Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=935729, https://bugs.webkit.org/show_bug.cgi?id=195016)
|
||||
3. Remove gaps between table borders by default.
|
||||
*/
|
||||
|
||||
table {
|
||||
text-indent: 0;
|
||||
/* 1 */
|
||||
border-color: inherit;
|
||||
/* 2 */
|
||||
border-collapse: collapse;
|
||||
/* 3 */
|
||||
}
|
||||
|
||||
/*
|
||||
1. Change the font styles in all browsers.
|
||||
2. Remove the margin in Firefox and Safari.
|
||||
3. Remove default padding in all browsers.
|
||||
*/
|
||||
|
||||
button,
|
||||
input,
|
||||
optgroup,
|
||||
select,
|
||||
textarea {
|
||||
font-family: inherit;
|
||||
/* 1 */
|
||||
font-size: 100%;
|
||||
/* 1 */
|
||||
line-height: inherit;
|
||||
/* 1 */
|
||||
color: inherit;
|
||||
/* 1 */
|
||||
margin: 0;
|
||||
/* 2 */
|
||||
padding: 0;
|
||||
/* 3 */
|
||||
}
|
||||
|
||||
/*
|
||||
Remove the inheritance of text transform in Edge and Firefox.
|
||||
*/
|
||||
|
||||
button,
|
||||
select {
|
||||
text-transform: none;
|
||||
}
|
||||
|
||||
/*
|
||||
1. Correct the inability to style clickable types in iOS and Safari.
|
||||
2. Remove default button styles.
|
||||
*/
|
||||
|
||||
button,
|
||||
[type='button'],
|
||||
[type='reset'],
|
||||
[type='submit'] {
|
||||
-webkit-appearance: button;
|
||||
/* 1 */
|
||||
background-color: transparent;
|
||||
/* 2 */
|
||||
background-image: none;
|
||||
/* 2 */
|
||||
}
|
||||
|
||||
/*
|
||||
Use the modern Firefox focus style for all focusable elements.
|
||||
*/
|
||||
|
||||
:-moz-focusring {
|
||||
outline: auto;
|
||||
}
|
||||
|
||||
/*
|
||||
Remove the additional `:invalid` styles in Firefox. (https://github.com/mozilla/gecko-dev/blob/2f9eacd9d3d995c937b4251a5557d95d494c9be1/layout/style/res/forms.css#L728-L737)
|
||||
*/
|
||||
|
||||
:-moz-ui-invalid {
|
||||
box-shadow: none;
|
||||
}
|
||||
|
||||
/*
|
||||
Add the correct vertical alignment in Chrome and Firefox.
|
||||
*/
|
||||
|
||||
progress {
|
||||
vertical-align: baseline;
|
||||
}
|
||||
|
||||
/*
|
||||
Correct the cursor style of increment and decrement buttons in Safari.
|
||||
*/
|
||||
|
||||
::-webkit-inner-spin-button,
|
||||
::-webkit-outer-spin-button {
|
||||
height: auto;
|
||||
}
|
||||
|
||||
/*
|
||||
1. Correct the odd appearance in Chrome and Safari.
|
||||
2. Correct the outline style in Safari.
|
||||
*/
|
||||
|
||||
[type='search'] {
|
||||
-webkit-appearance: textfield;
|
||||
/* 1 */
|
||||
outline-offset: -2px;
|
||||
/* 2 */
|
||||
}
|
||||
|
||||
/*
|
||||
Remove the inner padding in Chrome and Safari on macOS.
|
||||
*/
|
||||
|
||||
::-webkit-search-decoration {
|
||||
-webkit-appearance: none;
|
||||
}
|
||||
|
||||
/*
|
||||
1. Correct the inability to style clickable types in iOS and Safari.
|
||||
2. Change font properties to `inherit` in Safari.
|
||||
*/
|
||||
|
||||
::-webkit-file-upload-button {
|
||||
-webkit-appearance: button;
|
||||
/* 1 */
|
||||
font: inherit;
|
||||
/* 2 */
|
||||
}
|
||||
|
||||
/*
|
||||
Add the correct display in Chrome and Safari.
|
||||
*/
|
||||
|
||||
summary {
|
||||
display: list-item;
|
||||
}
|
||||
|
||||
/*
|
||||
Removes the default spacing and border for appropriate elements.
|
||||
*/
|
||||
|
||||
blockquote,
|
||||
dl,
|
||||
dd,
|
||||
h1,
|
||||
h2,
|
||||
h3,
|
||||
h4,
|
||||
h5,
|
||||
h6,
|
||||
hr,
|
||||
figure,
|
||||
p,
|
||||
pre {
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
fieldset {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
legend {
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
ol,
|
||||
ul,
|
||||
menu {
|
||||
list-style: none;
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
/*
|
||||
Prevent resizing textareas horizontally by default.
|
||||
*/
|
||||
|
||||
textarea {
|
||||
resize: vertical;
|
||||
}
|
||||
|
||||
/*
|
||||
1. Reset the default placeholder opacity in Firefox. (https://github.com/tailwindlabs/tailwindcss/issues/3300)
|
||||
2. Set the default placeholder color to the user's configured gray 400 color.
|
||||
*/
|
||||
|
||||
input::-moz-placeholder, textarea::-moz-placeholder {
|
||||
opacity: 1;
|
||||
/* 1 */
|
||||
color: #9ca3af;
|
||||
/* 2 */
|
||||
}
|
||||
|
||||
input:-ms-input-placeholder, textarea:-ms-input-placeholder {
|
||||
opacity: 1;
|
||||
/* 1 */
|
||||
color: #9ca3af;
|
||||
/* 2 */
|
||||
}
|
||||
|
||||
input::placeholder,
|
||||
textarea::placeholder {
|
||||
opacity: 1;
|
||||
/* 1 */
|
||||
color: #9ca3af;
|
||||
/* 2 */
|
||||
}
|
||||
|
||||
/*
|
||||
Set the default cursor for buttons.
|
||||
*/
|
||||
|
||||
button,
|
||||
[role="button"] {
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
/*
|
||||
Make sure disabled buttons don't get the pointer cursor.
|
||||
*/
|
||||
|
||||
:disabled {
|
||||
cursor: default;
|
||||
}
|
||||
|
||||
/*
|
||||
1. Make replaced elements `display: block` by default. (https://github.com/mozdevs/cssremedy/issues/14)
|
||||
2. Add `vertical-align: middle` to align replaced elements more sensibly by default. (https://github.com/jensimmons/cssremedy/issues/14#issuecomment-634934210)
|
||||
This can trigger a poorly considered lint error in some tools but is included by design.
|
||||
*/
|
||||
|
||||
img,
|
||||
svg,
|
||||
video,
|
||||
canvas,
|
||||
audio,
|
||||
iframe,
|
||||
embed,
|
||||
object {
|
||||
display: block;
|
||||
/* 1 */
|
||||
vertical-align: middle;
|
||||
/* 2 */
|
||||
}
|
||||
|
||||
/*
|
||||
Constrain images and videos to the parent width and preserve their intrinsic aspect ratio. (https://github.com/mozdevs/cssremedy/issues/14)
|
||||
*/
|
||||
|
||||
img,
|
||||
video {
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
}
|
||||
|
||||
/*
|
||||
Ensure the default browser behavior of the `hidden` attribute.
|
||||
*/
|
||||
|
||||
[hidden] {
|
||||
display: none;
|
||||
}
|
||||
|
||||
*, ::before, ::after {
|
||||
--tw-translate-x: 0;
|
||||
--tw-translate-y: 0;
|
||||
--tw-rotate: 0;
|
||||
--tw-skew-x: 0;
|
||||
--tw-skew-y: 0;
|
||||
--tw-scale-x: 1;
|
||||
--tw-scale-y: 1;
|
||||
--tw-pan-x: ;
|
||||
--tw-pan-y: ;
|
||||
--tw-pinch-zoom: ;
|
||||
--tw-scroll-snap-strictness: proximity;
|
||||
--tw-ordinal: ;
|
||||
--tw-slashed-zero: ;
|
||||
--tw-numeric-figure: ;
|
||||
--tw-numeric-spacing: ;
|
||||
--tw-numeric-fraction: ;
|
||||
--tw-ring-inset: ;
|
||||
--tw-ring-offset-width: 0px;
|
||||
--tw-ring-offset-color: #fff;
|
||||
--tw-ring-color: rgb(59 130 246 / 0.5);
|
||||
--tw-ring-offset-shadow: 0 0 #0000;
|
||||
--tw-ring-shadow: 0 0 #0000;
|
||||
--tw-shadow: 0 0 #0000;
|
||||
--tw-shadow-colored: 0 0 #0000;
|
||||
--tw-blur: ;
|
||||
--tw-brightness: ;
|
||||
--tw-contrast: ;
|
||||
--tw-grayscale: ;
|
||||
--tw-hue-rotate: ;
|
||||
--tw-invert: ;
|
||||
--tw-saturate: ;
|
||||
--tw-sepia: ;
|
||||
--tw-drop-shadow: ;
|
||||
--tw-backdrop-blur: ;
|
||||
--tw-backdrop-brightness: ;
|
||||
--tw-backdrop-contrast: ;
|
||||
--tw-backdrop-grayscale: ;
|
||||
--tw-backdrop-hue-rotate: ;
|
||||
--tw-backdrop-invert: ;
|
||||
--tw-backdrop-opacity: ;
|
||||
--tw-backdrop-saturate: ;
|
||||
--tw-backdrop-sepia: ;
|
||||
}
|
||||
|
||||
.container {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
@media (min-width: 640px) {
|
||||
.container {
|
||||
max-width: 640px;
|
||||
}
|
||||
}
|
||||
|
||||
@media (min-width: 768px) {
|
||||
.container {
|
||||
max-width: 768px;
|
||||
}
|
||||
}
|
||||
|
||||
@media (min-width: 1024px) {
|
||||
.container {
|
||||
max-width: 1024px;
|
||||
}
|
||||
}
|
||||
|
||||
@media (min-width: 1280px) {
|
||||
.container {
|
||||
max-width: 1280px;
|
||||
}
|
||||
}
|
||||
|
||||
@media (min-width: 1536px) {
|
||||
.container {
|
||||
max-width: 1536px;
|
||||
}
|
||||
}
|
||||
|
||||
.col-span-12 {
|
||||
grid-column: span 12 / span 12;
|
||||
}
|
||||
|
||||
.col-span-10 {
|
||||
grid-column: span 10 / span 10;
|
||||
}
|
||||
|
||||
.col-span-2 {
|
||||
grid-column: span 2 / span 2;
|
||||
}
|
||||
|
||||
.col-span-1 {
|
||||
grid-column: span 1 / span 1;
|
||||
}
|
||||
|
||||
.col-span-9 {
|
||||
grid-column: span 9 / span 9;
|
||||
}
|
||||
|
||||
.col-span-8 {
|
||||
grid-column: span 8 / span 8;
|
||||
}
|
||||
|
||||
.col-span-3 {
|
||||
grid-column: span 3 / span 3;
|
||||
}
|
||||
|
||||
.col-start-3 {
|
||||
grid-column-start: 3;
|
||||
}
|
||||
|
||||
.col-start-1 {
|
||||
grid-column-start: 1;
|
||||
}
|
||||
|
||||
.col-start-2 {
|
||||
grid-column-start: 2;
|
||||
}
|
||||
|
||||
.col-start-12 {
|
||||
grid-column-start: 12;
|
||||
}
|
||||
|
||||
.col-start-11 {
|
||||
grid-column-start: 11;
|
||||
}
|
||||
|
||||
.col-start-10 {
|
||||
grid-column-start: 10;
|
||||
}
|
||||
|
||||
.row-start-1 {
|
||||
grid-row-start: 1;
|
||||
}
|
||||
|
||||
.row-start-2 {
|
||||
grid-row-start: 2;
|
||||
}
|
||||
|
||||
.row-start-3 {
|
||||
grid-row-start: 3;
|
||||
}
|
||||
|
||||
.row-start-4 {
|
||||
grid-row-start: 4;
|
||||
}
|
||||
|
||||
.row-start-5 {
|
||||
grid-row-start: 5;
|
||||
}
|
||||
|
||||
.mt-5 {
|
||||
margin-top: 1.25rem;
|
||||
}
|
||||
|
||||
.mb-5 {
|
||||
margin-bottom: 1.25rem;
|
||||
}
|
||||
|
||||
.mt-8 {
|
||||
margin-top: 2rem;
|
||||
}
|
||||
|
||||
.mb-8 {
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
.mt-16 {
|
||||
margin-top: 4rem;
|
||||
}
|
||||
|
||||
.mb-16 {
|
||||
margin-bottom: 4rem;
|
||||
}
|
||||
|
||||
.mt-12 {
|
||||
margin-top: 3rem;
|
||||
}
|
||||
|
||||
.mb-12 {
|
||||
margin-bottom: 3rem;
|
||||
}
|
||||
|
||||
.mt-1 {
|
||||
margin-top: 0.25rem;
|
||||
}
|
||||
|
||||
.mt-11 {
|
||||
margin-top: 2.75rem;
|
||||
}
|
||||
|
||||
.mb-11 {
|
||||
margin-bottom: 2.75rem;
|
||||
}
|
||||
|
||||
.mt-3 {
|
||||
margin-top: 0.75rem;
|
||||
}
|
||||
|
||||
.mt-10 {
|
||||
margin-top: 2.5rem;
|
||||
}
|
||||
|
||||
.box-border {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
.box-content {
|
||||
box-sizing: content-box;
|
||||
}
|
||||
|
||||
.flex {
|
||||
display: flex;
|
||||
}
|
||||
|
||||
.grid {
|
||||
display: grid;
|
||||
}
|
||||
|
||||
.h-1 {
|
||||
height: 0.25rem;
|
||||
}
|
||||
|
||||
.h-\[4px\] {
|
||||
height: 4px;
|
||||
}
|
||||
|
||||
.h-\[8px\] {
|
||||
height: 8px;
|
||||
}
|
||||
|
||||
.h-\[30px\] {
|
||||
height: 30px;
|
||||
}
|
||||
|
||||
.h-\[2px\] {
|
||||
height: 2px;
|
||||
}
|
||||
|
||||
.h-\[1px\] {
|
||||
height: 1px;
|
||||
}
|
||||
|
||||
.w-screen {
|
||||
width: 100vw;
|
||||
}
|
||||
|
||||
.w-full {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
.max-w-lg {
|
||||
max-width: 32rem;
|
||||
}
|
||||
|
||||
.max-w-3xl {
|
||||
max-width: 48rem;
|
||||
}
|
||||
|
||||
.max-w-4xl {
|
||||
max-width: 56rem;
|
||||
}
|
||||
|
||||
.max-w-7xl {
|
||||
max-width: 80rem;
|
||||
}
|
||||
|
||||
.auto-cols-min {
|
||||
grid-auto-columns: -webkit-min-content;
|
||||
grid-auto-columns: min-content;
|
||||
}
|
||||
|
||||
.auto-cols-fr {
|
||||
grid-auto-columns: minmax();
|
||||
}
|
||||
|
||||
.auto-cols-max {
|
||||
grid-auto-columns: -webkit-max-content;
|
||||
grid-auto-columns: max-content;
|
||||
}
|
||||
|
||||
.grid-cols-5 {
|
||||
grid-template-columns: repeat(5, minmax());
|
||||
}
|
||||
|
||||
.grid-cols-7 {
|
||||
grid-template-columns: repeat(7, minmax());
|
||||
}
|
||||
|
||||
.grid-cols-12 {
|
||||
grid-template-columns: repeat(12, minmax());
|
||||
}
|
||||
|
||||
.grid-rows-4 {
|
||||
grid-template-rows: repeat(4, minmax());
|
||||
}
|
||||
|
||||
.grid-rows-5 {
|
||||
grid-template-rows: repeat(5, minmax());
|
||||
}
|
||||
|
||||
.flex-col {
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.items-center {
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.justify-center {
|
||||
justify-content: center;
|
||||
}
|
||||
|
||||
.gap-3 {
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.gap-2 {
|
||||
gap: 0.5rem;
|
||||
}
|
||||
|
||||
.gap-y-3 {
|
||||
row-gap: 0.75rem;
|
||||
}
|
||||
|
||||
.gap-x-3 {
|
||||
-moz-column-gap: 0.75rem;
|
||||
column-gap: 0.75rem;
|
||||
}
|
||||
|
||||
.gap-x-2 {
|
||||
-moz-column-gap: 0.5rem;
|
||||
column-gap: 0.5rem;
|
||||
}
|
||||
|
||||
.whitespace-pre-line {
|
||||
white-space: pre-line;
|
||||
}
|
||||
|
||||
.border {
|
||||
border-width: 1px;
|
||||
}
|
||||
|
||||
.border-t {
|
||||
border-top-width: 1px;
|
||||
}
|
||||
|
||||
.border-t-2 {
|
||||
border-top-width: 2px;
|
||||
}
|
||||
|
||||
.border-b-2 {
|
||||
border-bottom-width: 2px;
|
||||
}
|
||||
|
||||
.border-t-4 {
|
||||
border-top-width: 4px;
|
||||
}
|
||||
|
||||
.border-b-4 {
|
||||
border-bottom-width: 4px;
|
||||
}
|
||||
|
||||
.border-b {
|
||||
border-bottom-width: 1px;
|
||||
}
|
||||
|
||||
.border-solid {
|
||||
border-style: solid;
|
||||
}
|
||||
|
||||
.border-black {
|
||||
--tw-border-opacity: 1;
|
||||
border-color: rgb(0 0 0 / var(--tw-border-opacity));
|
||||
}
|
||||
|
||||
.bg-red-600 {
|
||||
--tw-bg-opacity: 1;
|
||||
background-color: rgb(220 38 38 / var(--tw-bg-opacity));
|
||||
}
|
||||
|
||||
.bg-white {
|
||||
--tw-bg-opacity: 1;
|
||||
background-color: rgb(255 255 255 / var(--tw-bg-opacity));
|
||||
}
|
||||
|
||||
.bg-slate-300 {
|
||||
--tw-bg-opacity: 1;
|
||||
background-color: rgb(203 213 225 / var(--tw-bg-opacity));
|
||||
}
|
||||
|
||||
.bg-slate-200 {
|
||||
--tw-bg-opacity: 1;
|
||||
background-color: rgb(226 232 240 / var(--tw-bg-opacity));
|
||||
}
|
||||
|
||||
.p-3 {
|
||||
padding: 0.75rem;
|
||||
}
|
||||
|
||||
.p-4 {
|
||||
padding: 1rem;
|
||||
}
|
||||
|
||||
.text-right {
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
.align-middle {
|
||||
vertical-align: middle;
|
||||
}
|
||||
|
||||
.text-3xl {
|
||||
font-size: 1.875rem;
|
||||
line-height: 2.25rem;
|
||||
}
|
||||
|
||||
.font-bold {
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.text-slate-400 {
|
||||
--tw-text-opacity: 1;
|
||||
color: rgb(148 163 184 / var(--tw-text-opacity));
|
||||
}
|
||||
|
||||
.text-blue-600 {
|
||||
--tw-text-opacity: 1;
|
||||
color: rgb(37 99 235 / var(--tw-text-opacity));
|
||||
}
|
||||
|
||||
.underline {
|
||||
-webkit-text-decoration-line: underline;
|
||||
text-decoration-line: underline;
|
||||
}
|
@ -1,6 +1,4 @@
|
||||
import os
|
||||
import re
|
||||
from io import StringIO
|
||||
|
||||
import dateutil.parser
|
||||
import requests
|
||||
@ -99,213 +97,3 @@ class TikaDocumentParser(DocumentParser):
|
||||
file.close()
|
||||
|
||||
return pdf_path
|
||||
|
||||
|
||||
class TikaDocumentParserEml(DocumentParser):
|
||||
"""
|
||||
This parser sends documents to a local tika server
|
||||
"""
|
||||
|
||||
logging_name = "paperless.parsing.tikaeml"
|
||||
_tika_parsed = None
|
||||
|
||||
def get_tika_result(self, document_path):
|
||||
if not self._tika_parsed:
|
||||
self.log("info", f"Sending {document_path} to Tika server")
|
||||
tika_server = settings.PAPERLESS_TIKA_ENDPOINT
|
||||
|
||||
try:
|
||||
self._tika_parsed = parser.from_file(
|
||||
document_path,
|
||||
tika_server,
|
||||
)
|
||||
except Exception as err:
|
||||
raise ParseError(
|
||||
f"Could not parse {document_path} with tika server at "
|
||||
f"{tika_server}: {err}",
|
||||
)
|
||||
|
||||
return self._tika_parsed
|
||||
|
||||
def get_thumbnail(self, document_path, mime_type, file_name=None):
|
||||
if not self.archive_path:
|
||||
self.archive_path = self.generate_pdf(document_path)
|
||||
|
||||
return make_thumbnail_from_pdf(
|
||||
self.archive_path,
|
||||
self.tempdir,
|
||||
self.logging_group,
|
||||
)
|
||||
|
||||
def extract_metadata(self, document_path, mime_type):
|
||||
result = []
|
||||
prefix_pattern = re.compile(r"(.*):(.*)")
|
||||
|
||||
try:
|
||||
parsed = self.get_tika_result(document_path)
|
||||
except ParseError as e:
|
||||
self.log(
|
||||
"warning",
|
||||
f"Error while fetching document metadata for " f"{document_path}: {e}",
|
||||
)
|
||||
return result
|
||||
|
||||
for key, value in parsed["metadata"].items():
|
||||
if isinstance(value, list):
|
||||
value = ", ".join([str(e) for e in value])
|
||||
value = str(value)
|
||||
try:
|
||||
m = prefix_pattern.match(key)
|
||||
result.append(
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": m.group(1),
|
||||
"key": m.group(2),
|
||||
"value": value,
|
||||
},
|
||||
)
|
||||
except AttributeError:
|
||||
result.append(
|
||||
{
|
||||
"namespace": "",
|
||||
"prefix": "",
|
||||
"key": key,
|
||||
"value": value,
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
self.log(
|
||||
"warning",
|
||||
f"Error while reading metadata {key}: {value}. Error: " f"{e}",
|
||||
)
|
||||
result.sort(key=lambda item: (item["prefix"], item["key"]))
|
||||
return result
|
||||
|
||||
def parse(self, document_path, mime_type, file_name=None):
|
||||
parsed = self.get_tika_result(document_path)
|
||||
|
||||
subject = parsed["metadata"].get("dc:subject", "<no subject>")
|
||||
content = parsed["content"].strip()
|
||||
|
||||
if content.startswith(subject):
|
||||
content = content[len(subject) :].strip()
|
||||
|
||||
content = re.sub(" +", " ", content)
|
||||
content = re.sub("\n+", "\n", content)
|
||||
|
||||
self.text = (
|
||||
f"{content}\n\n"
|
||||
f"From: {parsed['metadata'].get('Message-From', '')}\n"
|
||||
f"To: {parsed['metadata'].get('Message-To', '')}\n"
|
||||
f"CC: {parsed['metadata'].get('Message-CC', '')}"
|
||||
)
|
||||
|
||||
try:
|
||||
self.date = dateutil.parser.isoparse(
|
||||
parsed["metadata"]["dcterms:created"],
|
||||
)
|
||||
except Exception as e:
|
||||
self.log(
|
||||
"warning",
|
||||
f"Unable to extract date for document " f"{document_path}: {e}",
|
||||
)
|
||||
|
||||
self.archive_path = self.generate_pdf(document_path)
|
||||
|
||||
def generate_pdf(self, document_path):
|
||||
def clean_html(text: str):
|
||||
if isinstance(text, list):
|
||||
text = "\n".join([str(e) for e in text])
|
||||
if type(text) != str:
|
||||
text = str(text)
|
||||
text = text.replace("&", "&")
|
||||
text = text.replace("<", "<")
|
||||
text = text.replace(">", ">")
|
||||
text = text.replace(" ", " ")
|
||||
text = text.replace("'", "'")
|
||||
text = text.replace('"', """)
|
||||
return text
|
||||
|
||||
parsed = self.get_tika_result(document_path)
|
||||
|
||||
pdf_path = os.path.join(self.tempdir, "convert.pdf")
|
||||
gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
|
||||
url = gotenberg_server + "/forms/chromium/convert/html"
|
||||
|
||||
self.log("info", f"Converting {document_path} to PDF as {pdf_path}")
|
||||
|
||||
data = {}
|
||||
data["subject"] = clean_html(parsed["metadata"].get("dc:subject", ""))
|
||||
if data["subject"] != "":
|
||||
data["subject_label"] = "Subject"
|
||||
data["from"] = clean_html(parsed["metadata"].get("Message-From", ""))
|
||||
if data["from"] != "":
|
||||
data["from_label"] = "From"
|
||||
data["to"] = clean_html(parsed["metadata"].get("Message-To", ""))
|
||||
if data["to"] != "":
|
||||
data["to_label"] = "To"
|
||||
data["cc"] = clean_html(parsed["metadata"].get("Message-CC", ""))
|
||||
if data["cc"] != "":
|
||||
data["cc_label"] = "CC"
|
||||
data["bcc"] = clean_html(parsed["metadata"].get("Message-BCC", ""))
|
||||
if data["bcc"] != "":
|
||||
data["bcc_label"] = "BCC"
|
||||
data["date"] = clean_html(parsed["metadata"].get("dcterms:created", ""))
|
||||
|
||||
content = parsed.get("content", "").strip()
|
||||
if content.startswith(data["subject"]):
|
||||
content = content[len(data["subject"]) :].strip()
|
||||
data["content"] = clean_html(content)
|
||||
|
||||
html_file = os.path.join(os.path.dirname(__file__), "mail_template/index.html")
|
||||
css_file = os.path.join(os.path.dirname(__file__), "mail_template/output.css")
|
||||
placeholder_pattern = re.compile(r"{{(.+)}}")
|
||||
html = StringIO()
|
||||
|
||||
with open(html_file, "r") as html_template_handle:
|
||||
with open(css_file, "rb") as css_handle:
|
||||
for line in html_template_handle.readlines():
|
||||
for placeholder in placeholder_pattern.findall(line):
|
||||
line = re.sub(
|
||||
"{{" + placeholder + "}}",
|
||||
data.get(placeholder.strip(), ""),
|
||||
line,
|
||||
)
|
||||
html.write(line)
|
||||
html.seek(0)
|
||||
files = {
|
||||
"html": (
|
||||
"index.html",
|
||||
html,
|
||||
),
|
||||
"css": (
|
||||
"output.css",
|
||||
css_handle,
|
||||
),
|
||||
}
|
||||
headers = {}
|
||||
data = {
|
||||
"marginTop": "0",
|
||||
"marginBottom": "0",
|
||||
"marginLeft": "0",
|
||||
"marginRight": "0",
|
||||
"paperWidth": "8.27",
|
||||
"paperHeight": "11.7",
|
||||
"scale": "1.0",
|
||||
}
|
||||
try:
|
||||
response = requests.post(
|
||||
url,
|
||||
files=files,
|
||||
headers=headers,
|
||||
data=data,
|
||||
)
|
||||
response.raise_for_status() # ensure we notice bad responses
|
||||
except Exception as err:
|
||||
raise ParseError(f"Error while converting document to PDF: {err}")
|
||||
|
||||
with open(pdf_path, "wb") as file:
|
||||
file.write(response.content)
|
||||
file.close()
|
||||
|
||||
return pdf_path
|
||||
|
@ -22,19 +22,3 @@ def tika_consumer_declaration(sender, **kwargs):
|
||||
"text/rtf": ".rtf",
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def get_parser_eml(*args, **kwargs):
|
||||
from .parsers import TikaDocumentParserEml
|
||||
|
||||
return TikaDocumentParserEml(*args, **kwargs)
|
||||
|
||||
|
||||
def tika_consumer_declaration_eml(sender, **kwargs):
|
||||
return {
|
||||
"parser": get_parser_eml,
|
||||
"weight": 10,
|
||||
"mime_types": {
|
||||
"message/rfc822": ".eml",
|
||||
},
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user