Merge remote-tracking branch 'paperless/dev' into feature-consume-eml
@@ -405,6 +405,7 @@ class Consumer(LoggingMixin):
         # Don't save with the lock active. Saving will cause the file
         # renaming logic to acquire the lock as well.
+        # This triggers things like file renaming
         document.save()

         # Delete the file only if it was successfully consumed
@@ -438,6 +439,9 @@ class Consumer(LoggingMixin):

         self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id)

+        # Return the most up to date fields
+        document.refresh_from_db()
+
         return document

     def _store(self, text, date, mime_type) -> Document:
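Note on the hunk above: document.save() fires Django's post_save signal, and the filename handler may rename the file and write a new path back to the database, leaving the in-memory instance stale. refresh_from_db() reloads the row so the returned document carries the final fields. A minimal sketch of the pattern, with an illustrative receiver (not the actual paperless-ngx handler):

    from django.db.models.signals import post_save
    from django.dispatch import receiver

    @receiver(post_save, sender=Document)  # assumes a Document model with a filename field
    def rename_on_save(sender, instance, **kwargs):
        # A receiver like this updates the row behind the caller's back
        Document.objects.filter(pk=instance.pk).update(filename="renamed.pdf")

    document.save()             # receiver runs, row changes in the DB
    # document.filename still holds the pre-rename value here
    document.refresh_from_db()  # reload so callers see the final path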
@@ -1,4 +1,3 @@
-import datetime
 import logging
 import os
 from collections import defaultdict
@@ -172,7 +171,7 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
     else:
         asn = "-none-"

-    # Convert UTC database date to localized date
+    # Convert UTC database datetime to localized date
     local_added = timezone.localdate(doc.added)
     local_created = timezone.localdate(doc.created)
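Context for the hunk above: with USE_TZ enabled Django stores aware UTC datetimes, and timezone.localdate() projects them onto the configured local calendar day, which is what the generated filename should reflect. A quick sketch (the zone is only an example):

    import datetime
    import zoneinfo

    from django.utils import timezone

    # Aware UTC datetime, as stored in the database
    utc_dt = datetime.datetime(2020, 7, 26, 3, 30, tzinfo=datetime.timezone.utc)

    # 03:30 UTC on July 26 is still July 25 in America/Chicago
    local = timezone.localdate(utc_dt, timezone=zoneinfo.ZoneInfo("America/Chicago"))
    print(local.isoformat())  # 2020-07-25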
@@ -180,14 +179,20 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False):
         title=pathvalidate.sanitize_filename(doc.title, replacement_text="-"),
         correspondent=correspondent,
         document_type=document_type,
-        created=datetime.date.isoformat(local_created),
-        created_year=local_created.year,
-        created_month=f"{local_created.month:02}",
-        created_day=f"{local_created.day:02}",
-        added=datetime.date.isoformat(local_added),
-        added_year=local_added.year,
-        added_month=f"{local_added.month:02}",
-        added_day=f"{local_added.day:02}",
+        created=local_created.isoformat(),
+        created_year=local_created.strftime("%Y"),
+        created_year_short=local_created.strftime("%y"),
+        created_month=local_created.strftime("%m"),
+        created_month_name=local_created.strftime("%B"),
+        created_month_name_short=local_created.strftime("%b"),
+        created_day=local_created.strftime("%d"),
+        added=local_added.isoformat(),
+        added_year=local_added.strftime("%Y"),
+        added_year_short=local_added.strftime("%y"),
+        added_month=local_added.strftime("%m"),
+        added_month_name=local_added.strftime("%B"),
+        added_month_name_short=local_added.strftime("%b"),
+        added_day=local_added.strftime("%d"),
         asn=asn,
         tags=tags,
         tag_list=tag_list,
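The practical upshot of this hunk: all date placeholders now come from strftime, and short-year and month-name variants become available in FILENAME_FORMAT. A rough mapping of the new placeholders to their strftime output (month names follow the active locale):

    import datetime

    local_created = datetime.date(1989, 12, 21)

    print(local_created.strftime("%Y"))  # 1989     -> {created_year}
    print(local_created.strftime("%y"))  # 89       -> {created_year_short}
    print(local_created.strftime("%m"))  # 12       -> {created_month}
    print(local_created.strftime("%B"))  # December -> {created_month_name}
    print(local_created.strftime("%b"))  # Dec      -> {created_month_name_short}
    print(local_created.strftime("%d"))  # 21       -> {created_day}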
@@ -142,14 +142,14 @@ def matches(matching_model, document):
         return bool(match)

     elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY:
-        from fuzzywuzzy import fuzz
+        from rapidfuzz import fuzz

         match = re.sub(r"[^\w\s]", "", matching_model.match)
         text = re.sub(r"[^\w\s]", "", document_content)
         if matching_model.is_insensitive:
             match = match.lower()
             text = text.lower()
-        if fuzz.partial_ratio(match, text) >= 90:
+        if fuzz.partial_ratio(match, text, score_cutoff=90):
             # TODO: make this better
             log_reason(
                 matching_model,
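Worth noting, this is more than a drop-in library swap: rapidfuzz's partial_ratio accepts a score_cutoff and returns 0 whenever the score falls below it, so the bare truthiness test is equivalent to the old >= 90 comparison while letting rapidfuzz bail out early internally. A small sketch:

    from rapidfuzz import fuzz

    text = "invoice from acme corporation"

    # "acme corp" occurs as a substring, so the partial score is 100
    print(fuzz.partial_ratio("acme corp", text, score_cutoff=90))  # 100.0

    # Below the cutoff the score collapses to 0.0, which is falsy
    print(fuzz.partial_ratio("zzzz", text, score_cutoff=90))       # 0.0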
@@ -400,6 +400,13 @@ def update_filename_and_move_files(sender, instance, **kwargs):

     with FileLock(settings.MEDIA_LOCK):
         try:
+
+            # If this was waiting for the lock, the filename or archive_filename
+            # of this document may have been updated. This happens if multiple updates
+            # get queued from the UI for the same document
+            # So freshen up the data before doing anything
+            instance.refresh_from_db()
+
             old_filename = instance.filename
             old_source_path = instance.source_path
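The new lines close a race: a handler that waited on MEDIA_LOCK may hold a snapshot from before another queued update renamed the document, so it re-reads the row once it owns the lock. A minimal sketch of the lock-then-refresh pattern using the filelock package (the lock path and function name are illustrative):

    from filelock import FileLock

    MEDIA_LOCK = "/tmp/media.lock"  # illustrative path

    def update_filename_and_move(instance):
        with FileLock(MEDIA_LOCK):
            # Another update may have renamed the document while this
            # call was waiting, so reload before touching the filesystem
            instance.refresh_from_db()
            old_filename = instance.filename
            # ... rename/move files using the fresh state ...
            return old_filename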
@@ -14,6 +14,7 @@ except ImportError:
    import backports.zoneinfo as zoneinfo

 from django.conf import settings
+from django.utils import timezone
 from django.test import override_settings
 from django.test import TestCase
@@ -326,6 +327,12 @@ class TestConsumer(DirectoriesMixin, TestCase):
     def testNormalOperation(self):

         filename = self.get_test_file()
+
+        # Get the local time, as an aware datetime
+        # Roughly equal to file modification time
+        rough_create_date_local = timezone.localtime(timezone.now())
+
+        # Consume the file
         document = self.consumer.try_consume_file(filename)

         self.assertEqual(document.content, "The Text")
@@ -351,7 +358,20 @@ class TestConsumer(DirectoriesMixin, TestCase):

         self._assert_first_last_send_progress()

-        self.assertEqual(document.created.tzinfo, zoneinfo.ZoneInfo("America/Chicago"))
+        # Convert UTC time from DB to local time
+        document_date_local = timezone.localtime(document.created)
+
+        self.assertEqual(
+            document_date_local.tzinfo,
+            zoneinfo.ZoneInfo("America/Chicago"),
+        )
+        self.assertEqual(document_date_local.tzinfo, rough_create_date_local.tzinfo)
+        self.assertEqual(document_date_local.year, rough_create_date_local.year)
+        self.assertEqual(document_date_local.month, rough_create_date_local.month)
+        self.assertEqual(document_date_local.day, rough_create_date_local.day)
+        self.assertEqual(document_date_local.hour, rough_create_date_local.hour)
+        self.assertEqual(document_date_local.minute, rough_create_date_local.minute)
+        # Skipping seconds and more precise

     @override_settings(FILENAME_FORMAT=None)
     def testDeleteMacFiles(self):
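The rewritten assertions stop assuming which tzinfo the database hands back and instead compare everything in local time: timezone.localtime() converts the stored UTC value under the active TIME_ZONE, and the test checks the converted fields down to the minute. A compact illustration (the zone matches the test's override):

    import datetime
    import zoneinfo

    from django.utils import timezone

    stored = datetime.datetime(2022, 6, 1, 18, 4, tzinfo=datetime.timezone.utc)
    local = timezone.localtime(stored, timezone=zoneinfo.ZoneInfo("America/Chicago"))

    print(local.tzinfo)  # America/Chicago
    print(local.hour)    # 13 -- 18:04 UTC is 13:04 CDT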
@@ -1036,6 +1036,34 @@ class TestFilenameGeneration(TestCase):
         self.assertEqual(generate_filename(doc_a), "0000002.pdf")
         self.assertEqual(generate_filename(doc_b), "SomeImportantNone/2020-07-25.pdf")

+    @override_settings(
+        FILENAME_FORMAT="{created_year_short}/{created_month_name_short}/{created_month_name}/{title}",
+    )
+    def test_short_names_created(self):
+        doc = Document.objects.create(
+            title="The Title",
+            created=timezone.make_aware(
+                datetime.datetime(1989, 12, 21, 7, 36, 51, 153),
+            ),
+            mime_type="application/pdf",
+            pk=2,
+            checksum="2",
+        )
+        self.assertEqual(generate_filename(doc), "89/Dec/December/The Title.pdf")
+
+    @override_settings(
+        FILENAME_FORMAT="{added_year_short}/{added_month_name}/{added_month_name_short}/{title}",
+    )
+    def test_short_names_added(self):
+        doc = Document.objects.create(
+            title="The Title",
+            added=timezone.make_aware(datetime.datetime(1984, 8, 21, 7, 36, 51, 153)),
+            mime_type="application/pdf",
+            pk=2,
+            checksum="2",
+        )
+        self.assertEqual(generate_filename(doc), "84/August/Aug/The Title.pdf")
+

 def run():
     doc = Document.objects.create(
@@ -347,6 +347,13 @@ if os.getenv("PAPERLESS_DBHOST"):
     if os.getenv("PAPERLESS_DBENGINE") == "mariadb":
         engine = "django.db.backends.mysql"
         options = {"read_default_file": "/etc/mysql/my.cnf", "charset": "utf8mb4"}
+
+        # Silence Django error on old MariaDB versions.
+        # VARCHAR can support > 255 in modern versions
+        # https://docs.djangoproject.com/en/4.1/ref/checks/#database
+        # https://mariadb.com/kb/en/innodb-system-variables/#innodb_large_prefix
+        SILENCED_SYSTEM_CHECKS = ["mysql.W003"]
+
     else:  # Default to PostgresDB
         engine = "django.db.backends.postgresql_psycopg2"
         options = {"sslmode": os.getenv("PAPERLESS_DBSSLMODE", "prefer")}
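For reference, SILENCED_SYSTEM_CHECKS is a stock Django setting: listing mysql.W003 suppresses the system-check warning about unique CharFields longer than 255 characters on MySQL/MariaDB backends, which modern MariaDB handles fine. A sketch of how the computed values plausibly feed the final DATABASES dict (the surrounding structure is assumed, not shown in this hunk):

    import os

    engine = "django.db.backends.mysql"
    options = {"read_default_file": "/etc/mysql/my.cnf", "charset": "utf8mb4"}

    DATABASES = {
        "default": {
            "ENGINE": engine,
            "HOST": os.getenv("PAPERLESS_DBHOST"),
            "OPTIONS": options,
        },
    }

    # Suppresses: "MySQL may not allow unique CharFields to have a max_length > 255"
    SILENCED_SYSTEM_CHECKS = ["mysql.W003"]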
@@ -1,10 +1,11 @@
 import datetime
 import os
+import time
 from pathlib import Path
 from typing import Final

 import pytest
 from django.test import TestCase
 from documents.parsers import ParseError
 from paperless_tika.parsers import TikaDocumentParser

@@ -24,6 +25,44 @@ class TestTikaParserAgainstServer(TestCase):
     def tearDown(self) -> None:
         self.parser.cleanup()

+    def try_parse_with_wait(self, test_file, mime_type):
+        """
+        For whatever reason, the image started during the test pipeline likes to
+        segfault sometimes, when run with the exact files that usually pass.
+
+        So, this function will retry the parsing up to 3 times, with larger backoff
+        periods between each attempt, in hopes the issue resolves itself during
+        one attempt to parse.
+
+        This will wait the following:
+            - Attempt 1 - 20s following failure
+            - Attempt 2 - 40s following failure
+            - Attempt 3 - 80s following failure
+
+        """
+        succeeded = False
+        retry_time = 20.0
+        retry_count = 0
+        max_retry_count = 3
+
+        while retry_count < max_retry_count and not succeeded:
+            try:
+                self.parser.parse(test_file, mime_type)
+
+                succeeded = True
+            except Exception as e:
+                print(f"{e} during try #{retry_count}", flush=True)
+
+                retry_count = retry_count + 1
+
+                time.sleep(retry_time)
+                retry_time = retry_time * 2.0
+
+        self.assertTrue(
+            succeeded,
+            "Continued Tika server errors after multiple retries",
+        )
+
     def test_basic_parse_odt(self):
         """
         GIVEN:
@@ -36,7 +75,7 @@ class TestTikaParserAgainstServer(TestCase):
         """
         test_file = self.SAMPLE_DIR / Path("sample.odt")

-        self.parser.parse(test_file, "application/vnd.oasis.opendocument.text")
+        self.try_parse_with_wait(test_file, "application/vnd.oasis.opendocument.text")

         self.assertEqual(
             self.parser.text,
@@ -62,7 +101,7 @@ class TestTikaParserAgainstServer(TestCase):
         """
         test_file = self.SAMPLE_DIR / Path("sample.docx")

-        self.parser.parse(
+        self.try_parse_with_wait(
             test_file,
             "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
         )