mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Merge remote-tracking branch 'paperless/dev' into feature-consume-eml
This commit is contained in:
		| @@ -405,6 +405,7 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|                 # Don't save with the lock active. Saving will cause the file | ||||
|                 # renaming logic to acquire the lock as well. | ||||
|                 # This triggers things like file renaming | ||||
|                 document.save() | ||||
|  | ||||
|                 # Delete the file only if it was successfully consumed | ||||
| @@ -438,6 +439,9 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         self._send_progress(100, 100, "SUCCESS", MESSAGE_FINISHED, document.id) | ||||
|  | ||||
|         # Return the most up to date fields | ||||
|         document.refresh_from_db() | ||||
|  | ||||
|         return document | ||||
|  | ||||
|     def _store(self, text, date, mime_type) -> Document: | ||||
|   | ||||
| @@ -1,4 +1,3 @@ | ||||
| import datetime | ||||
| import logging | ||||
| import os | ||||
| from collections import defaultdict | ||||
| @@ -172,7 +171,7 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False): | ||||
|             else: | ||||
|                 asn = "-none-" | ||||
|  | ||||
|             # Convert UTC database date to localized date | ||||
|             # Convert UTC database datetime to localized date | ||||
|             local_added = timezone.localdate(doc.added) | ||||
|             local_created = timezone.localdate(doc.created) | ||||
|  | ||||
| @@ -180,14 +179,20 @@ def generate_filename(doc, counter=0, append_gpg=True, archive_filename=False): | ||||
|                 title=pathvalidate.sanitize_filename(doc.title, replacement_text="-"), | ||||
|                 correspondent=correspondent, | ||||
|                 document_type=document_type, | ||||
|                 created=datetime.date.isoformat(local_created), | ||||
|                 created_year=local_created.year, | ||||
|                 created_month=f"{local_created.month:02}", | ||||
|                 created_day=f"{local_created.day:02}", | ||||
|                 added=datetime.date.isoformat(local_added), | ||||
|                 added_year=local_added.year, | ||||
|                 added_month=f"{local_added.month:02}", | ||||
|                 added_day=f"{local_added.day:02}", | ||||
|                 created=local_created.isoformat(), | ||||
|                 created_year=local_created.strftime("%Y"), | ||||
|                 created_year_short=local_created.strftime("%y"), | ||||
|                 created_month=local_created.strftime("%m"), | ||||
|                 created_month_name=local_created.strftime("%B"), | ||||
|                 created_month_name_short=local_created.strftime("%b"), | ||||
|                 created_day=local_created.strftime("%d"), | ||||
|                 added=local_added.isoformat(), | ||||
|                 added_year=local_added.strftime("%Y"), | ||||
|                 added_year_short=local_added.strftime("%y"), | ||||
|                 added_month=local_added.strftime("%m"), | ||||
|                 added_month_name=local_added.strftime("%B"), | ||||
|                 added_month_name_short=local_added.strftime("%b"), | ||||
|                 added_day=local_added.strftime("%d"), | ||||
|                 asn=asn, | ||||
|                 tags=tags, | ||||
|                 tag_list=tag_list, | ||||
|   | ||||
| @@ -142,14 +142,14 @@ def matches(matching_model, document): | ||||
|         return bool(match) | ||||
|  | ||||
|     elif matching_model.matching_algorithm == MatchingModel.MATCH_FUZZY: | ||||
|         from fuzzywuzzy import fuzz | ||||
|         from rapidfuzz import fuzz | ||||
|  | ||||
|         match = re.sub(r"[^\w\s]", "", matching_model.match) | ||||
|         text = re.sub(r"[^\w\s]", "", document_content) | ||||
|         if matching_model.is_insensitive: | ||||
|             match = match.lower() | ||||
|             text = text.lower() | ||||
|         if fuzz.partial_ratio(match, text) >= 90: | ||||
|         if fuzz.partial_ratio(match, text, score_cutoff=90): | ||||
|             # TODO: make this better | ||||
|             log_reason( | ||||
|                 matching_model, | ||||
|   | ||||
| @@ -400,6 +400,13 @@ def update_filename_and_move_files(sender, instance, **kwargs): | ||||
|  | ||||
|     with FileLock(settings.MEDIA_LOCK): | ||||
|         try: | ||||
|  | ||||
|             # If this was waiting for the lock, the filename or archive_filename | ||||
|             # of this document may have been updated. This happens if multiple updates | ||||
|             # get queued from the UI for the same document, so refresh the | ||||
|             # instance from the database before doing anything. | ||||
|             instance.refresh_from_db() | ||||
|  | ||||
|             old_filename = instance.filename | ||||
|             old_source_path = instance.source_path | ||||
|  | ||||
|   | ||||
| @@ -14,6 +14,7 @@ except ImportError: | ||||
|     import backports.zoneinfo as zoneinfo | ||||
|  | ||||
| from django.conf import settings | ||||
| from django.utils import timezone | ||||
| from django.test import override_settings | ||||
| from django.test import TestCase | ||||
|  | ||||
| @@ -326,6 +327,12 @@ class TestConsumer(DirectoriesMixin, TestCase): | ||||
|     def testNormalOperation(self): | ||||
|  | ||||
|         filename = self.get_test_file() | ||||
|  | ||||
|         # Get the local time, as an aware datetime | ||||
|         # Roughly equal to file modification time | ||||
|         rough_create_date_local = timezone.localtime(timezone.now()) | ||||
|  | ||||
|         # Consume the file | ||||
|         document = self.consumer.try_consume_file(filename) | ||||
|  | ||||
|         self.assertEqual(document.content, "The Text") | ||||
| @@ -351,7 +358,20 @@ class TestConsumer(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self._assert_first_last_send_progress() | ||||
|  | ||||
|         self.assertEqual(document.created.tzinfo, zoneinfo.ZoneInfo("America/Chicago")) | ||||
|         # Convert UTC time from DB to local time | ||||
|         document_date_local = timezone.localtime(document.created) | ||||
|  | ||||
|         self.assertEqual( | ||||
|             document_date_local.tzinfo, | ||||
|             zoneinfo.ZoneInfo("America/Chicago"), | ||||
|         ) | ||||
|         self.assertEqual(document_date_local.tzinfo, rough_create_date_local.tzinfo) | ||||
|         self.assertEqual(document_date_local.year, rough_create_date_local.year) | ||||
|         self.assertEqual(document_date_local.month, rough_create_date_local.month) | ||||
|         self.assertEqual(document_date_local.day, rough_create_date_local.day) | ||||
|         self.assertEqual(document_date_local.hour, rough_create_date_local.hour) | ||||
|         self.assertEqual(document_date_local.minute, rough_create_date_local.minute) | ||||
|         # Seconds and finer units are not compared; the two times are only roughly equal | ||||
|  | ||||
|     @override_settings(FILENAME_FORMAT=None) | ||||
|     def testDeleteMacFiles(self): | ||||
|   | ||||
| @@ -1036,6 +1036,34 @@ class TestFilenameGeneration(TestCase): | ||||
|         self.assertEqual(generate_filename(doc_a), "0000002.pdf") | ||||
|         self.assertEqual(generate_filename(doc_b), "SomeImportantNone/2020-07-25.pdf") | ||||
|  | ||||
|     @override_settings( | ||||
|         FILENAME_FORMAT="{created_year_short}/{created_month_name_short}/{created_month_name}/{title}", | ||||
|     ) | ||||
|     def test_short_names_created(self): | ||||
|         doc = Document.objects.create( | ||||
|             title="The Title", | ||||
|             created=timezone.make_aware( | ||||
|                 datetime.datetime(1989, 12, 21, 7, 36, 51, 153), | ||||
|             ), | ||||
|             mime_type="application/pdf", | ||||
|             pk=2, | ||||
|             checksum="2", | ||||
|         ) | ||||
|         self.assertEqual(generate_filename(doc), "89/Dec/December/The Title.pdf") | ||||
|  | ||||
|     @override_settings( | ||||
|         FILENAME_FORMAT="{added_year_short}/{added_month_name}/{added_month_name_short}/{title}", | ||||
|     ) | ||||
|     def test_short_names_added(self): | ||||
|         doc = Document.objects.create( | ||||
|             title="The Title", | ||||
|             added=timezone.make_aware(datetime.datetime(1984, 8, 21, 7, 36, 51, 153)), | ||||
|             mime_type="application/pdf", | ||||
|             pk=2, | ||||
|             checksum="2", | ||||
|         ) | ||||
|         self.assertEqual(generate_filename(doc), "84/August/Aug/The Title.pdf") | ||||
|  | ||||
|  | ||||
| def run(): | ||||
|     doc = Document.objects.create( | ||||
|   | ||||
| @@ -347,6 +347,13 @@ if os.getenv("PAPERLESS_DBHOST"): | ||||
|     if os.getenv("PAPERLESS_DBENGINE") == "mariadb": | ||||
|         engine = "django.db.backends.mysql" | ||||
|         options = {"read_default_file": "/etc/mysql/my.cnf", "charset": "utf8mb4"} | ||||
|  | ||||
|         # Silence Django error on old MariaDB versions. | ||||
|         # VARCHAR can support > 255 in modern versions | ||||
|         # https://docs.djangoproject.com/en/4.1/ref/checks/#database | ||||
|         # https://mariadb.com/kb/en/innodb-system-variables/#innodb_large_prefix | ||||
|         SILENCED_SYSTEM_CHECKS = ["mysql.W003"] | ||||
|  | ||||
|     else:  # Default to PostgresDB | ||||
|         engine = "django.db.backends.postgresql_psycopg2" | ||||
|         options = {"sslmode": os.getenv("PAPERLESS_DBSSLMODE", "prefer")} | ||||
|   | ||||
| @@ -1,10 +1,11 @@ | ||||
| import datetime | ||||
| import os | ||||
| import time | ||||
| from pathlib import Path | ||||
| from typing import Final | ||||
|  | ||||
| import pytest | ||||
| from django.test import TestCase | ||||
| from documents.parsers import ParseError | ||||
| from paperless_tika.parsers import TikaDocumentParser | ||||
|  | ||||
|  | ||||
| @@ -24,6 +25,44 @@ class TestTikaParserAgainstServer(TestCase): | ||||
|     def tearDown(self) -> None: | ||||
|         self.parser.cleanup() | ||||
|  | ||||
|     def try_parse_with_wait(self, test_file, mime_type): | ||||
|         """ | ||||
|         For whatever reason, the image started during the test pipeline likes to | ||||
|         segfault sometimes, when run with the exact files that usually pass. | ||||
|  | ||||
|         So, this function will retry the parsing up to 3 times, with larger backoff | ||||
|         periods between each attempt, in hopes the issue resolves itself during | ||||
|         one attempt to parse. | ||||
|  | ||||
|         This will wait the following: | ||||
|             - Attempt 1 - 20s following failure | ||||
|             - Attempt 2 - 40s following failure | ||||
|             - Attempt 3 - 80s following failure | ||||
|  | ||||
|         """ | ||||
|         succeeded = False | ||||
|         retry_time = 20.0 | ||||
|         retry_count = 0 | ||||
|         max_retry_count = 3 | ||||
|  | ||||
|         while retry_count < max_retry_count and not succeeded: | ||||
|             try: | ||||
|                 self.parser.parse(test_file, mime_type) | ||||
|  | ||||
|                 succeeded = True | ||||
|             except Exception as e: | ||||
|                 print(f"{e} during try #{retry_count}", flush=True) | ||||
|  | ||||
|                 retry_count = retry_count + 1 | ||||
|  | ||||
|                 time.sleep(retry_time) | ||||
|                 retry_time = retry_time * 2.0 | ||||
|  | ||||
|         self.assertTrue( | ||||
|             succeeded, | ||||
|             "Continued Tika server errors after multiple retries", | ||||
|         ) | ||||
|  | ||||
|     def test_basic_parse_odt(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
| @@ -36,7 +75,7 @@ class TestTikaParserAgainstServer(TestCase): | ||||
|         """ | ||||
|         test_file = self.SAMPLE_DIR / Path("sample.odt") | ||||
|  | ||||
|         self.parser.parse(test_file, "application/vnd.oasis.opendocument.text") | ||||
|         self.try_parse_with_wait(test_file, "application/vnd.oasis.opendocument.text") | ||||
|  | ||||
|         self.assertEqual( | ||||
|             self.parser.text, | ||||
| @@ -62,7 +101,7 @@ class TestTikaParserAgainstServer(TestCase): | ||||
|         """ | ||||
|         test_file = self.SAMPLE_DIR / Path("sample.docx") | ||||
|  | ||||
|         self.parser.parse( | ||||
|         self.try_parse_with_wait( | ||||
|             test_file, | ||||
|             "application/vnd.openxmlformats-officedocument.wordprocessingml.document", | ||||
|         ) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 phail
					phail