Merge remote-tracking branch 'paperless/dev' into feature-consume-eml

This commit is contained in:
phail
2022-11-03 21:00:01 +01:00
45 changed files with 1426 additions and 971 deletions

View File

@@ -405,6 +405,7 @@ class Consumer(LoggingMixin):
# Don't save with the lock active. Saving will cause the file
# renaming logic to acquire the lock as well.
# This triggers things like file renaming
document.save()
# Delete the file only if it was successfully consumed
@@ -438,6 +439,9 @@ class Consumer(LoggingMixin):
self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id)
# Return the most up to date fields
document.refresh_from_db()
return document
def _store(self, text, date, mime_type) -> Document:

View File

@@ -1,4 +1,3 @@
import datetime
import logging
import os
from collections import defaultdict
@@ -172,7 +171,7 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
else:
asn = "-none-"
# Convert UTC database date to localized date
# Convert UTC database datetime to localized date
local_added = timezone.localdate(doc.added)
local_created = timezone.localdate(doc.created)
@@ -180,14 +179,20 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
title=pathvalidate.sanitize_filename(doc.title, replacement_text="-"),
correspondent=correspondent,
document_type=document_type,
created=datetime.date.isoformat(local_created),
created_year=local_created.year,
created_month=f"{local_created.month:02}",
created_day=f"{local_created.day:02}",
added=datetime.date.isoformat(local_added),
added_year=local_added.year,
added_month=f"{local_added.month:02}",
added_day=f"{local_added.day:02}",
created=local_created.isoformat(),
created_year=local_created.strftime("%Y"),
created_year_short=local_created.strftime("%y"),
created_month=local_created.strftime("%m"),
created_month_name=local_created.strftime("%B"),
created_month_name_short=local_created.strftime("%b"),
created_day=local_created.strftime("%d"),
added=local_added.isoformat(),
added_year=local_added.strftime("%Y"),
added_year_short=local_added.strftime("%y"),
added_month=local_added.strftime("%m"),
added_month_name=local_added.strftime("%B"),
added_month_name_short=local_added.strftime("%b"),
added_day=local_added.strftime("%d"),
asn=asn,
tags=tags,
tag_list=tag_list,

View File

@@ -142,14 +142,14 @@ def matches(matching_model, document):
return bool(match)
elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
from fuzzywuzzy import fuzz
from rapidfuzz import fuzz
match = re.sub(r"[^\w\s]", "", matching_model.match)
text = re.sub(r"[^\w\s]", "", document_content)
if matching_model.is_insensitive:
match = match.lower()
text = text.lower()
if fuzz.partial_ratio(match, text) >= 90:
if fuzz.partial_ratio(match, text, score_cutoff=90):
# TODO: make this better
log_reason(
matching_model,

View File

@@ -400,6 +400,13 @@ def update_filename_and_move_files(sender, instance, **kwargs):
with FileLock(settings.MEDIA_LOCK):
try:
# If this was waiting for the lock, the filename or archive_filename
# of this document may have been updated. This happens if multiple updates
# get queued from the UI for the same document.
# So refresh the data from the database before doing anything.
instance.refresh_from_db()
old_filename = instance.filename
old_source_path = instance.source_path

View File

@@ -14,6 +14,7 @@ except ImportError:
import backports.zoneinfo as zoneinfo
from django.conf import settings
from django.utils import timezone
from django.test import override_settings
from django.test import TestCase
@@ -326,6 +327,12 @@ class TestConsumer(DirectoriesMixin, TestCase):
def testNormalOperation(self):
filename = self.get_test_file()
# Get the local time, as an aware datetime
# Roughly equal to file modification time
rough_create_date_local = timezone.localtime(timezone.now())
# Consume the file
document = self.consumer.try_consume_file(filename)
self.assertEqual(document.content, "The Text")
@@ -351,7 +358,20 @@ class TestConsumer(DirectoriesMixin, TestCase):
self._assert_first_last_send_progress()
self.assertEqual(document.created.tzinfo, zoneinfo.ZoneInfo("America/Chicago"))
# Convert UTC time from DB to local time
document_date_local = timezone.localtime(document.created)
self.assertEqual(
document_date_local.tzinfo,
zoneinfo.ZoneInfo("America/Chicago"),
)
self.assertEqual(document_date_local.tzinfo, rough_create_date_local.tzinfo)
self.assertEqual(document_date_local.year, rough_create_date_local.year)
self.assertEqual(document_date_local.month, rough_create_date_local.month)
self.assertEqual(document_date_local.day, rough_create_date_local.day)
self.assertEqual(document_date_local.hour, rough_create_date_local.hour)
self.assertEqual(document_date_local.minute, rough_create_date_local.minute)
# Seconds and anything more precise are not compared; the two times are only roughly equal
@override_settings(FILENAME_FORMAT=None)
def testDeleteMacFiles(self):

View File

@@ -1036,6 +1036,34 @@ class TestFilenameGeneration(TestCase):
self.assertEqual(generate_filename(doc_a), "0000002.pdf")
self.assertEqual(generate_filename(doc_b), "SomeImportantNone/2020-07-25.pdf")
@override_settings(
    FILENAME_FORMAT="{created_year_short}/{created_month_name_short}/{created_month_name}/{title}",
)
def test_short_names_created(self):
    """
    GIVEN:
        - A filename format using the short year, the abbreviated month
          name and the full month name of the created date
        - A document created on 1989-12-21
    WHEN:
        - The filename is generated for the document
    THEN:
        - The path is built from "89", "Dec" and "December"
    """
    created_naive = datetime.datetime(1989, 12, 21, 7, 36, 51, 153)

    doc = Document.objects.create(
        title="The Title",
        created=timezone.make_aware(created_naive),
        mime_type="application/pdf",
        pk=2,
        checksum="2",
    )

    expected_path = "89/Dec/December/The Title.pdf"
    self.assertEqual(generate_filename(doc), expected_path)
@override_settings(
    FILENAME_FORMAT="{added_year_short}/{added_month_name}/{added_month_name_short}/{title}",
)
def test_short_names_added(self):
    """
    GIVEN:
        - A filename format using the short year, the full month name
          and the abbreviated month name of the added date
        - A document added on 1984-08-21
    WHEN:
        - The filename is generated for the document
    THEN:
        - The path is built from "84", "August" and "Aug"
    """
    added_naive = datetime.datetime(1984, 8, 21, 7, 36, 51, 153)

    doc = Document.objects.create(
        title="The Title",
        added=timezone.make_aware(added_naive),
        mime_type="application/pdf",
        pk=2,
        checksum="2",
    )

    expected_path = "84/August/Aug/The Title.pdf"
    self.assertEqual(generate_filename(doc), expected_path)
def run():
doc = Document.objects.create(

View File

@@ -347,6 +347,13 @@ if os.getenv("PAPERLESS_DBHOST"):
if os.getenv("PAPERLESS_DBENGINE") == "mariadb":
engine = "django.db.backends.mysql"
options = {"read_default_file": "/etc/mysql/my.cnf", "charset": "utf8mb4"}
# Silence the Django system check warning (mysql.W003) raised on old MariaDB versions.
# Modern versions support VARCHAR columns longer than 255 characters
# https://docs.djangoproject.com/en/4.1/ref/checks/#database
# https://mariadb.com/kb/en/innodb-system-variables/#innodb_large_prefix
SILENCED_SYSTEM_CHECKS = ["mysql.W003"]
else: # Default to PostgresDB
engine = "django.db.backends.postgresql_psycopg2"
options = {"sslmode": os.getenv("PAPERLESS_DBSSLMODE", "prefer")}

View File

@@ -1,10 +1,11 @@
import datetime
import os
import time
from pathlib import Path
from typing import Final
import pytest
from django.test import TestCase
from documents.parsers import ParseError
from paperless_tika.parsers import TikaDocumentParser
@@ -24,6 +25,44 @@ class TestTikaParserAgainstServer(TestCase):
def tearDown(self) -> None:
self.parser.cleanup()
def try_parse_with_wait(
    self,
    test_file,
    mime_type,
    max_retry_count=3,
    retry_time=20.0,
):
    """
    Parse ``test_file`` with ``self.parser``, retrying when parsing raises.

    For whatever reason, the image started during the test pipeline likes to
    segfault sometimes, when run with the exact files that usually pass.
    So, this function will attempt the parsing up to ``max_retry_count``
    times, doubling the backoff period between attempts, in hopes the issue
    resolves itself during one attempt to parse.

    With the default arguments this will wait the following:
      - Attempt 1 - 20s following failure
      - Attempt 2 - 40s following failure
      - Attempt 3 - no wait, the test fails immediately

    Args:
        test_file: path of the file handed to the parser.
        mime_type: MIME type handed to the parser along with the file.
        max_retry_count: total number of parse attempts before giving up.
        retry_time: seconds to wait after the first failed attempt; the
            wait doubles after every further failure.

    The test is failed through ``self.assertTrue`` if no attempt succeeded.
    """
    succeeded = False

    for attempt in range(1, max_retry_count + 1):
        try:
            self.parser.parse(test_file, mime_type)
        except Exception as e:
            # Deliberately broad: any failure is treated as a transient
            # server problem and retried
            print(f"{e} during try #{attempt}", flush=True)

            # Only back off when another attempt will follow; sleeping
            # after the last failure would just delay the test failure
            if attempt < max_retry_count:
                time.sleep(retry_time)
                retry_time = retry_time * 2.0
        else:
            succeeded = True
            break

    self.assertTrue(
        succeeded,
        "Continued Tika server errors after multiple retries",
    )
def test_basic_parse_odt(self):
"""
GIVEN:
@@ -36,7 +75,7 @@ class TestTikaParserAgainstServer(TestCase):
"""
test_file = self.SAMPLE_DIR / Path("sample.odt")
self.parser.parse(test_file, "application/vnd.oasis.opendocument.text")
self.try_parse_with_wait(test_file, "application/vnd.oasis.opendocument.text")
self.assertEqual(
self.parser.text,
@@ -62,7 +101,7 @@ class TestTikaParserAgainstServer(TestCase):
"""
test_file = self.SAMPLE_DIR / Path("sample.docx")
self.parser.parse(
self.try_parse_with_wait(
test_file,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
)