remove .eml parser from tika

This commit is contained in:
phail
2022-05-03 18:02:08 +02:00
parent 990e905a04
commit 5a899664f8
9 changed files with 33 additions and 1174 deletions

View File

@@ -1,27 +0,0 @@
# Generated by Django 4.0.4 on 2022-04-19 18:13
from django.db import migrations, models
# Migration for the "paperless_mail" app that redefines the "action" field of
# the "mailrule" model.
class Migration(migrations.Migration):
# Declares a dependency on migration "0010_mailrule_consumption_scope" of the
# same app.
dependencies = [
("paperless_mail", "0010_mailrule_consumption_scope"),
]
# A single operation: alter "mailrule.action" to a PositiveIntegerField with
# four integer-coded choices (1-4).
operations = [
migrations.AlterField(
model_name="mailrule",
name="action",
field=models.PositiveIntegerField(
choices=[
(1, "Delete"),
(2, "Move to specified folder"),
(3, "Mark as read, don't process read mails"),
(4, "Flag the mail, don't process flagged mails"),
],
# The default is 3, i.e. the "Mark as read, don't process read mails" choice.
default=3,
verbose_name="action",
),
),
]

View File

@@ -1,4 +1,4 @@
# Generated by Django 4.0.4 on 2022-04-14 22:36
# Generated by Django 4.0.4 on 2022-05-03 15:58
from django.db import migrations, models
@@ -6,7 +6,7 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("paperless_mail", "0009_alter_mailrule_action_alter_mailrule_folder"),
("paperless_mail", "0014_alter_mailrule_action"),
]
operations = [

View File

@@ -1,13 +0,0 @@
# Generated by Django 4.0.4 on 2022-04-29 21:56
from django.db import migrations
# Migration for the "paperless_mail" app that carries no operations; it only
# declares dependencies on two other migrations of the same app.
# NOTE(review): this looks like a merge migration joining two migration
# branches - confirm against the migration history.
class Migration(migrations.Migration):
dependencies = [
("paperless_mail", "0011_alter_mailrule_action"),
("paperless_mail", "0014_alter_mailrule_action"),
]
# Empty on purpose: this migration lists no operations.
operations = []

View File

@@ -10,6 +10,7 @@ from documents.parsers import DocumentParser
from documents.parsers import make_thumbnail_from_pdf
from documents.parsers import ParseError
from imap_tools import MailMessage
from tika import parser
class MailDocumentParser(DocumentParser):
@@ -117,6 +118,36 @@ class MailDocumentParser(DocumentParser):
self.date = mail.date
self.archive_path = self.generate_pdf(document_path)
def tika_parse(self, document_path):
    """
    Extract the text of a document by sending it to the Tika server.

    The file is passed to ``parser.from_file`` together with the endpoint
    configured in ``settings.PAPERLESS_TIKA_ENDPOINT``. If the extracted
    text starts with the subject reported in the metadata, that prefix is
    removed; runs of spaces and runs of newlines are then collapsed, and
    the sender / recipient metadata is appended.

    :param document_path: path of the file handed to the Tika client
    :return: the cleaned text followed by "From:", "To:" and "CC:" lines
    :raises ParseError: if the call to the Tika server fails
    """
    self.log("info", f"Sending {document_path} to Tika server")
    tika_server = settings.PAPERLESS_TIKA_ENDPOINT

    try:
        parsed = parser.from_file(document_path, tika_server)
    except Exception as err:
        # Chain the original exception so the root cause stays visible in
        # the traceback.
        raise ParseError(
            f"Could not parse {document_path} with tika server at "
            f"{tika_server}: {err}",
        ) from err

    # The Tika client may hand back None for the content (nothing could be
    # extracted) or for the metadata; fall back to empty values instead of
    # failing with AttributeError / TypeError further down.
    metadata = parsed.get("metadata") or {}
    content = (parsed.get("content") or "").strip()

    subject = metadata.get("dc:subject", "<no subject>")
    # Only strip the subject when it is a plain string; a non-string value
    # (presumably possible for multi-valued metadata - verify) would make
    # str.startswith raise.
    if isinstance(subject, str) and content.startswith(subject):
        content = content[len(subject) :].strip()

    # Collapse repeated spaces and repeated newlines.
    content = re.sub(r" +", " ", content)
    content = re.sub(r"\n+", "\n", content)

    text = (
        f"{content}\n\n"
        f"From: {metadata.get('Message-From', '')}\n"
        f"To: {metadata.get('Message-To', '')}\n"
        f"CC: {metadata.get('Message-CC', '')}"
    )
    return text
def generate_pdf(self, document_path):
def clean_html(text: str):
if isinstance(text, list):