mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2025-08-07 19:08:32 -05:00
remove .eml parser from tika
This commit is contained in:
@@ -1,27 +0,0 @@
|
||||
# Generated by Django 4.0.4 on 2022-04-19 18:13
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Alter ``MailRule.action`` to a four-choice positive integer field."""

    # Applied on top of the consumption-scope migration of the same app.
    dependencies = [
        ("paperless_mail", "0010_mailrule_consumption_scope"),
    ]

    operations = [
        migrations.AlterField(
            model_name="mailrule",
            name="action",
            field=models.PositiveIntegerField(
                choices=[
                    (1, "Delete"),
                    (2, "Move to specified folder"),
                    (3, "Mark as read, don't process read mails"),
                    (4, "Flag the mail, don't process flagged mails"),
                ],
                # 3 is the "Mark as read" entry in the choices above.
                default=3,
                verbose_name="action",
            ),
        ),
    ]
|
@@ -1,4 +1,4 @@
|
||||
# Generated by Django 4.0.4 on 2022-04-14 22:36
|
||||
# Generated by Django 4.0.4 on 2022-05-03 15:58
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
@@ -6,7 +6,7 @@ from django.db import migrations, models
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
("paperless_mail", "0009_alter_mailrule_action_alter_mailrule_folder"),
|
||||
("paperless_mail", "0014_alter_mailrule_action"),
|
||||
]
|
||||
|
||||
operations = [
|
@@ -1,13 +0,0 @@
|
||||
# Generated by Django 4.0.4 on 2022-04-29 21:56
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """No-op migration that depends on two ``paperless_mail`` migrations.

    It declares both ``0011_alter_mailrule_action`` and
    ``0014_alter_mailrule_action`` as prerequisites and performs no
    operations of its own.
    """

    dependencies = [
        ("paperless_mail", "0011_alter_mailrule_action"),
        ("paperless_mail", "0014_alter_mailrule_action"),
    ]

    # Intentionally empty: nothing is changed by this migration.
    operations = []
|
@@ -10,6 +10,7 @@ from documents.parsers import DocumentParser
|
||||
from documents.parsers import make_thumbnail_from_pdf
|
||||
from documents.parsers import ParseError
|
||||
from imap_tools import MailMessage
|
||||
from tika import parser
|
||||
|
||||
|
||||
class MailDocumentParser(DocumentParser):
|
||||
@@ -117,6 +118,36 @@ class MailDocumentParser(DocumentParser):
|
||||
self.date = mail.date
|
||||
self.archive_path = self.generate_pdf(document_path)
|
||||
|
||||
def tika_parse(self, document_path):
    """Extract the text of a mail file through the configured Tika server.

    The file is sent to ``settings.PAPERLESS_TIKA_ENDPOINT``. The returned
    content has a leading subject line removed (when the content starts
    with the subject reported in the metadata), runs of spaces and of
    newlines collapsed, and a From/To/CC footer appended.

    Args:
        document_path: Path of the file handed to ``parser.from_file``.

    Returns:
        The cleaned content followed by a blank line and the
        ``From:``, ``To:`` and ``CC:`` lines taken from the metadata.

    Raises:
        ParseError: If the call to the Tika server fails for any reason.
    """
    self.log("info", f"Sending {document_path} to Tika server")
    tika_server = settings.PAPERLESS_TIKA_ENDPOINT

    try:
        parsed = parser.from_file(document_path, tika_server)
    except Exception as err:
        # Chain explicitly so the traceback keeps the underlying cause.
        raise ParseError(
            f"Could not parse {document_path} with tika server at "
            f"{tika_server}: {err}",
        ) from err

    # NOTE(review): the Tika client appears able to return a result with
    # no "metadata" key or with "content" set to None when nothing was
    # extracted; fall back to empty values instead of crashing on
    # ``None.strip()`` / ``KeyError`` - confirm against the client docs.
    metadata = parsed.get("metadata") or {}
    content = (parsed.get("content") or "").strip()

    # Drop the subject when Tika repeats it at the start of the content.
    subject = metadata.get("dc:subject", "<no subject>")
    if content.startswith(subject):
        content = content[len(subject) :].strip()

    # Collapse repeated spaces and repeated newlines.
    content = re.sub(" +", " ", content)
    content = re.sub("\n+", "\n", content)

    return (
        f"{content}\n\n"
        f"From: {metadata.get('Message-From', '')}\n"
        f"To: {metadata.get('Message-To', '')}\n"
        f"CC: {metadata.get('Message-CC', '')}"
    )
|
||||
|
||||
def generate_pdf(self, document_path):
|
||||
def clean_html(text: str):
|
||||
if isinstance(text, list):
|
||||
|
Reference in New Issue
Block a user