mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-28 03:46:06 -05:00 
			
		
		
		
	Adds additional testing for both date parsing and consumed document created date
This commit is contained in:
		| @@ -3,6 +3,8 @@ import hashlib | ||||
| import os | ||||
| import uuid | ||||
| from subprocess import Popen | ||||
| from typing import Optional | ||||
| from typing import Type | ||||
|  | ||||
| import magic | ||||
| from asgiref.sync import async_to_sync | ||||
| @@ -23,6 +25,7 @@ from .models import Document | ||||
| from .models import DocumentType | ||||
| from .models import FileInfo | ||||
| from .models import Tag | ||||
| from .parsers import DocumentParser | ||||
| from .parsers import get_parser_class_for_mime_type | ||||
| from .parsers import parse_date | ||||
| from .parsers import ParseError | ||||
| @@ -186,7 +189,7 @@ class Consumer(LoggingMixin): | ||||
|         override_document_type_id=None, | ||||
|         override_tag_ids=None, | ||||
|         task_id=None, | ||||
|     ): | ||||
|     ) -> Document: | ||||
|         """ | ||||
|         Return the document object if it was successfully created. | ||||
|         """ | ||||
| @@ -220,7 +223,10 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         self.log("debug", f"Detected mime type: {mime_type}") | ||||
|  | ||||
|         parser_class = get_parser_class_for_mime_type(mime_type) | ||||
|         # Based on the mime type, get the parser for that type | ||||
|         parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type( | ||||
|             mime_type, | ||||
|         ) | ||||
|         if not parser_class: | ||||
|             self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}") | ||||
|  | ||||
| @@ -241,7 +247,10 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         # This doesn't parse the document yet, but gives us a parser. | ||||
|  | ||||
|         document_parser = parser_class(self.logging_group, progress_callback) | ||||
|         document_parser: DocumentParser = parser_class( | ||||
|             self.logging_group, | ||||
|             progress_callback, | ||||
|         ) | ||||
|  | ||||
|         self.log("debug", f"Parser: {type(document_parser).__name__}") | ||||
|  | ||||
| @@ -270,7 +279,7 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|             text = document_parser.get_text() | ||||
|             date = document_parser.get_date() | ||||
|             if not date: | ||||
|             if date is None: | ||||
|                 self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE) | ||||
|                 date = parse_date(self.filename, text) | ||||
|             archive_path = document_parser.get_archive_path() | ||||
| @@ -342,7 +351,7 @@ class Consumer(LoggingMixin): | ||||
|                             ).hexdigest() | ||||
|  | ||||
|                 # Don't save with the lock active. Saving will cause the file | ||||
|                 # renaming logic to aquire the lock as well. | ||||
|                 # renaming logic to acquire the lock as well. | ||||
|                 document.save() | ||||
|  | ||||
|                 # Delete the file only if it was successfully consumed | ||||
| @@ -362,7 +371,8 @@ class Consumer(LoggingMixin): | ||||
|         except Exception as e: | ||||
|             self._fail( | ||||
|                 str(e), | ||||
|                 f"The following error occured while consuming " f"{self.filename}: {e}", | ||||
|                 f"The following error occurred while consuming " | ||||
|                 f"{self.filename}: {e}", | ||||
|                 exc_info=True, | ||||
|             ) | ||||
|         finally: | ||||
| @@ -376,21 +386,26 @@ class Consumer(LoggingMixin): | ||||
|  | ||||
|         return document | ||||
|  | ||||
|     def _store(self, text, date, mime_type): | ||||
|     def _store(self, text, date, mime_type) -> Document: | ||||
|  | ||||
|         # If someone gave us the original filename, use it instead of doc. | ||||
|  | ||||
|         file_info = FileInfo.from_filename(self.filename) | ||||
|  | ||||
|         stats = os.stat(self.path) | ||||
|  | ||||
|         self.log("debug", "Saving record to database") | ||||
|  | ||||
|         created = ( | ||||
|             file_info.created | ||||
|             or date | ||||
|             or timezone.make_aware(datetime.datetime.fromtimestamp(stats.st_mtime)) | ||||
|         ) | ||||
|         if file_info.created is not None: | ||||
|             create_date = file_info.created | ||||
|             self.log("debug", f"Creation date from FileInfo: {create_date}") | ||||
|         elif date is not None: | ||||
|             create_date = date | ||||
|             self.log("debug", f"Creation date from parse_date: {create_date}") | ||||
|         else: | ||||
|             stats = os.stat(self.path) | ||||
|             create_date = timezone.make_aware( | ||||
|                 datetime.datetime.fromtimestamp(stats.st_mtime), | ||||
|             ) | ||||
|             self.log("debug", "Creation date from st_mtime: {create_date}") | ||||
|  | ||||
|         storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||
|  | ||||
| @@ -400,8 +415,8 @@ class Consumer(LoggingMixin): | ||||
|                 content=text, | ||||
|                 mime_type=mime_type, | ||||
|                 checksum=hashlib.md5(f.read()).hexdigest(), | ||||
|                 created=created, | ||||
|                 modified=created, | ||||
|                 created=create_date, | ||||
|                 modified=create_date, | ||||
|                 storage_type=storage_type, | ||||
|             ) | ||||
|  | ||||
|   | ||||
| @@ -380,6 +380,10 @@ class SavedViewFilterRule(models.Model): | ||||
|  | ||||
|  | ||||
| # TODO: why is this in the models file? | ||||
| # TODO: how about, what is this and where is it documented? | ||||
| # It appears to be parsing JSON from an environment variable to get a title and date from | ||||
| # the filename, if possible, as a higher priority than either document filename or | ||||
| # content parsing | ||||
| class FileInfo: | ||||
|  | ||||
|     REGEXES = OrderedDict( | ||||
| @@ -387,8 +391,7 @@ class FileInfo: | ||||
|             ( | ||||
|                 "created-title", | ||||
|                 re.compile( | ||||
|                     r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " | ||||
|                     r"(?P<title>.*)$", | ||||
|                     r"^(?P<created>\d{8}(\d{6})?Z) - " r"(?P<title>.*)$", | ||||
|                     flags=re.IGNORECASE, | ||||
|                 ), | ||||
|             ), | ||||
| @@ -428,7 +431,7 @@ class FileInfo: | ||||
|             properties[name] = getattr(cls, "_get_{}".format(name))(properties[name]) | ||||
|  | ||||
|     @classmethod | ||||
|     def from_filename(cls, filename): | ||||
|     def from_filename(cls, filename) -> "FileInfo": | ||||
|         # Mutate filename in-place before parsing its components | ||||
|         # by applying at most one of the configured transformations. | ||||
|         for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS: | ||||
|   | ||||
| @@ -1,3 +1,4 @@ | ||||
| import datetime | ||||
| import logging | ||||
| import mimetypes | ||||
| import os | ||||
| @@ -5,6 +6,8 @@ import re | ||||
| import shutil | ||||
| import subprocess | ||||
| import tempfile | ||||
| from typing import Optional | ||||
| from typing import Set | ||||
|  | ||||
| import magic | ||||
| from django.conf import settings | ||||
| @@ -40,11 +43,11 @@ DATE_REGEX = re.compile( | ||||
| logger = logging.getLogger("paperless.parsing") | ||||
|  | ||||
|  | ||||
| def is_mime_type_supported(mime_type): | ||||
| def is_mime_type_supported(mime_type) -> bool: | ||||
|     return get_parser_class_for_mime_type(mime_type) is not None | ||||
|  | ||||
|  | ||||
| def get_default_file_extension(mime_type): | ||||
| def get_default_file_extension(mime_type) -> str: | ||||
|     for response in document_consumer_declaration.send(None): | ||||
|         parser_declaration = response[1] | ||||
|         supported_mime_types = parser_declaration["mime_types"] | ||||
| @@ -59,14 +62,14 @@ def get_default_file_extension(mime_type): | ||||
|         return "" | ||||
|  | ||||
|  | ||||
| def is_file_ext_supported(ext): | ||||
| def is_file_ext_supported(ext) -> bool: | ||||
|     if ext: | ||||
|         return ext.lower() in get_supported_file_extensions() | ||||
|     else: | ||||
|         return False | ||||
|  | ||||
|  | ||||
| def get_supported_file_extensions(): | ||||
| def get_supported_file_extensions() -> Set[str]: | ||||
|     extensions = set() | ||||
|     for response in document_consumer_declaration.send(None): | ||||
|         parser_declaration = response[1] | ||||
| @@ -121,7 +124,7 @@ def run_convert( | ||||
|     auto_orient=False, | ||||
|     extra=None, | ||||
|     logging_group=None, | ||||
| ): | ||||
| ) -> None: | ||||
|  | ||||
|     environment = os.environ.copy() | ||||
|     if settings.CONVERT_MEMORY_LIMIT: | ||||
| @@ -146,11 +149,11 @@ def run_convert( | ||||
|         raise ParseError("Convert failed at {}".format(args)) | ||||
|  | ||||
|  | ||||
| def get_default_thumbnail(): | ||||
| def get_default_thumbnail() -> str: | ||||
|     return os.path.join(os.path.dirname(__file__), "resources", "document.png") | ||||
|  | ||||
|  | ||||
| def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None): | ||||
| def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str: | ||||
|     out_path = os.path.join(temp_dir, "convert_gs.png") | ||||
|  | ||||
|     # if convert fails, fall back to extracting | ||||
| @@ -184,7 +187,7 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None): | ||||
|         return get_default_thumbnail() | ||||
|  | ||||
|  | ||||
| def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None): | ||||
| def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: | ||||
|     """ | ||||
|     The thumbnail of a PDF is just a 500px wide image of the first page. | ||||
|     """ | ||||
| @@ -209,12 +212,12 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None): | ||||
|     return out_path | ||||
|  | ||||
|  | ||||
| def parse_date(filename, text): | ||||
| def parse_date(filename, text) -> Optional[datetime.datetime]: | ||||
|     """ | ||||
|     Returns the date of the document. | ||||
|     """ | ||||
|  | ||||
|     def __parser(ds, date_order): | ||||
|     def __parser(ds: str, date_order: str) -> datetime.datetime: | ||||
|         """ | ||||
|         Call dateparser.parse with a particular date ordering | ||||
|         """ | ||||
| @@ -230,9 +233,9 @@ def parse_date(filename, text): | ||||
|             }, | ||||
|         ) | ||||
|  | ||||
|     def __filter(date): | ||||
|     def __filter(date: datetime.datetime) -> Optional[datetime.datetime]: | ||||
|         if ( | ||||
|             date | ||||
|             date is not None | ||||
|             and date.year > 1900 | ||||
|             and date <= timezone.now() | ||||
|             and date.date() not in settings.IGNORE_DATES | ||||
| @@ -244,8 +247,10 @@ def parse_date(filename, text): | ||||
|  | ||||
|     # if filename date parsing is enabled, search there first: | ||||
|     if settings.FILENAME_DATE_ORDER: | ||||
|         logger.info("Attempting parsing from filename") | ||||
|         for m in re.finditer(DATE_REGEX, filename): | ||||
|             date_string = m.group(0) | ||||
|             logger.info(f"Found potential date: {date_string}") | ||||
|  | ||||
|             try: | ||||
|                 date = __parser(date_string, settings.FILENAME_DATE_ORDER) | ||||
| @@ -255,11 +260,16 @@ def parse_date(filename, text): | ||||
|  | ||||
|             date = __filter(date) | ||||
|             if date is not None: | ||||
|                 logger.info(f"Found date: {date}") | ||||
|                 return date | ||||
|             else: | ||||
|                 logger.info("Filtered date out") | ||||
|  | ||||
|     logger.info("Attempting parsing from content") | ||||
|     # Iterate through all regex matches in text and try to parse the date | ||||
|     for m in re.finditer(DATE_REGEX, text): | ||||
|         date_string = m.group(0) | ||||
|         logger.info(f"Found potential date: {date_string}") | ||||
|  | ||||
|         try: | ||||
|             date = __parser(date_string, settings.DATE_ORDER) | ||||
| @@ -269,7 +279,10 @@ def parse_date(filename, text): | ||||
|  | ||||
|         date = __filter(date) | ||||
|         if date is not None: | ||||
|             break | ||||
|             logger.info(f"Found date: {date}") | ||||
|             return date | ||||
|         else: | ||||
|             logger.info("Filtered date out") | ||||
|  | ||||
|     return date | ||||
|  | ||||
| @@ -294,7 +307,7 @@ class DocumentParser(LoggingMixin): | ||||
|  | ||||
|         self.archive_path = None | ||||
|         self.text = None | ||||
|         self.date = None | ||||
|         self.date: Optional[datetime.datetime] = None | ||||
|         self.progress_callback = progress_callback | ||||
|  | ||||
|     def progress(self, current_progress, max_progress): | ||||
| @@ -342,7 +355,7 @@ class DocumentParser(LoggingMixin): | ||||
|     def get_text(self): | ||||
|         return self.text | ||||
|  | ||||
|     def get_date(self): | ||||
|     def get_date(self) -> Optional[datetime.datetime]: | ||||
|         return self.date | ||||
|  | ||||
|     def cleanup(self): | ||||
|   | ||||
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/documents/originals/0000005.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/documents/originals/0000005.pdf
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/documents/originals/0000006.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/documents/originals/0000006.pdf
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							| @@ -1,3 +1,4 @@ | ||||
| import datetime | ||||
| import os | ||||
| import re | ||||
| import shutil | ||||
| @@ -5,6 +6,8 @@ import tempfile | ||||
| from unittest import mock | ||||
| from unittest.mock import MagicMock | ||||
|  | ||||
| from dateutil import tz | ||||
|  | ||||
| try: | ||||
|     import zoneinfo | ||||
| except ImportError: | ||||
| @@ -502,7 +505,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | ||||
|  | ||||
|         self.assertRaisesMessage( | ||||
|             ConsumerError, | ||||
|             "sample.pdf: The following error occured while consuming sample.pdf: NO.", | ||||
|             "sample.pdf: The following error occurred while consuming sample.pdf: NO.", | ||||
|             self.consumer.try_consume_file, | ||||
|             filename, | ||||
|         ) | ||||
| @@ -654,6 +657,127 @@ class TestConsumer(DirectoriesMixin, TestCase): | ||||
|         sanity_check() | ||||
|  | ||||
|  | ||||
| @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) | ||||
| class TestConsumerCreatedDate(DirectoriesMixin, TestCase): | ||||
|     def setUp(self): | ||||
|         super(TestConsumerCreatedDate, self).setUp() | ||||
|  | ||||
|         # this prevents websocket message reports during testing. | ||||
|         patcher = mock.patch("documents.consumer.Consumer._send_progress") | ||||
|         self._send_progress = patcher.start() | ||||
|         self.addCleanup(patcher.stop) | ||||
|  | ||||
|         self.consumer = Consumer() | ||||
|  | ||||
|     def test_consume_date_from_content(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - File content with date in DMY (default) format | ||||
|  | ||||
|         THEN: | ||||
|             - Should parse the date from the file content | ||||
|         """ | ||||
|         src = os.path.join( | ||||
|             os.path.dirname(__file__), | ||||
|             "samples", | ||||
|             "documents", | ||||
|             "originals", | ||||
|             "0000005.pdf", | ||||
|         ) | ||||
|         dst = os.path.join(self.dirs.scratch_dir, "sample.pdf") | ||||
|         shutil.copy(src, dst) | ||||
|  | ||||
|         document = self.consumer.try_consume_file(dst) | ||||
|  | ||||
|         self.assertEqual( | ||||
|             document.created, | ||||
|             datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||
|         ) | ||||
|  | ||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||
|     def test_consume_date_from_filename(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - File content with date in DMY (default) format | ||||
|             - Filename with date in YMD format | ||||
|  | ||||
|         THEN: | ||||
|             - Should parse the date from the filename | ||||
|         """ | ||||
|         src = os.path.join( | ||||
|             os.path.dirname(__file__), | ||||
|             "samples", | ||||
|             "documents", | ||||
|             "originals", | ||||
|             "0000005.pdf", | ||||
|         ) | ||||
|         dst = os.path.join(self.dirs.scratch_dir, "Scan - 2022-02-01.pdf") | ||||
|         shutil.copy(src, dst) | ||||
|  | ||||
|         document = self.consumer.try_consume_file(dst) | ||||
|  | ||||
|         self.assertEqual( | ||||
|             document.created, | ||||
|             datetime.datetime(2022, 2, 1, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||
|         ) | ||||
|  | ||||
|     def test_consume_date_filename_date_use_content(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - File content with date in DMY (default) format | ||||
|             - Filename date parsing disabled | ||||
|             - Filename with date in YMD format | ||||
|  | ||||
|         THEN: | ||||
|             - Should parse the date from the content | ||||
|         """ | ||||
|         src = os.path.join( | ||||
|             os.path.dirname(__file__), | ||||
|             "samples", | ||||
|             "documents", | ||||
|             "originals", | ||||
|             "0000005.pdf", | ||||
|         ) | ||||
|         dst = os.path.join(self.dirs.scratch_dir, "Scan - 2022-02-01.pdf") | ||||
|         shutil.copy(src, dst) | ||||
|  | ||||
|         document = self.consumer.try_consume_file(dst) | ||||
|  | ||||
|         self.assertEqual( | ||||
|             document.created, | ||||
|             datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||
|         ) | ||||
|  | ||||
|     @override_settings( | ||||
|         IGNORE_DATES=(datetime.date(2010, 12, 13), datetime.date(2011, 11, 12)), | ||||
|     ) | ||||
|     def test_consume_date_use_content_with_ignore(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - File content with dates in DMY (default) format | ||||
|             - File content includes ignored dates | ||||
|  | ||||
|         THEN: | ||||
|             - Should parse the first non-ignored date from the content | ||||
|         """ | ||||
|         src = os.path.join( | ||||
|             os.path.dirname(__file__), | ||||
|             "samples", | ||||
|             "documents", | ||||
|             "originals", | ||||
|             "0000006.pdf", | ||||
|         ) | ||||
|         dst = os.path.join(self.dirs.scratch_dir, "0000006.pdf") | ||||
|         shutil.copy(src, dst) | ||||
|  | ||||
|         document = self.consumer.try_consume_file(dst) | ||||
|  | ||||
|         self.assertEqual( | ||||
|             document.created, | ||||
|             datetime.datetime(1997, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class PreConsumeTestCase(TestCase): | ||||
|     @mock.patch("documents.consumer.Popen") | ||||
|     @override_settings(PRE_CONSUME_SCRIPT=None) | ||||
|   | ||||
| @@ -8,6 +8,7 @@ from django.conf import settings | ||||
| from django.test import override_settings | ||||
| from django.test import TestCase | ||||
| from documents.parsers import parse_date | ||||
| from paperless.settings import DATE_ORDER | ||||
|  | ||||
|  | ||||
| class TestDate(TestCase): | ||||
| @@ -160,19 +161,112 @@ class TestDate(TestCase): | ||||
|     def test_crazy_date_with_spaces(self, *args): | ||||
|         self.assertIsNone(parse_date("", "20 408000l 2475")) | ||||
|  | ||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||
|     def test_filename_date_parse_valid_ymd(self, *args): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Date parsing from the filename is enabled | ||||
|             - Filename date format is with Year Month Day (YMD) | ||||
|             - Filename contains date matching the format | ||||
|  | ||||
|         THEN: | ||||
|             - Should parse the date from the filename | ||||
|         """ | ||||
|         self.assertEqual( | ||||
|             parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here"), | ||||
|             datetime.datetime(2022, 4, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||
|         ) | ||||
|  | ||||
|     @override_settings(FILENAME_DATE_ORDER="DMY") | ||||
|     def test_filename_date_parse_valid_dmy(self, *args): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Date parsing from the filename is enabled | ||||
|             - Filename date format is with Day Month Year (DMY) | ||||
|             - Filename contains date matching the format | ||||
|  | ||||
|         THEN: | ||||
|             - Should parse the date from the filename | ||||
|         """ | ||||
|         self.assertEqual( | ||||
|             parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here"), | ||||
|             datetime.datetime(2021, 1, 10, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||
|         ) | ||||
|  | ||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||
|     def test_filename_date_parse_invalid(self, *args): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Date parsing from the filename is enabled | ||||
|             - Filename includes no date | ||||
|             - File content includes no date | ||||
|  | ||||
|         THEN: | ||||
|             - No date is parsed | ||||
|         """ | ||||
|         self.assertIsNone( | ||||
|             parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"), | ||||
|         ) | ||||
|  | ||||
|     @override_settings( | ||||
|         FILENAME_DATE_ORDER="YMD", | ||||
|         IGNORE_DATES=(datetime.date(2022, 4, 1),), | ||||
|     ) | ||||
|     def test_filename_date_ignored_use_content(self, *args): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Date parsing from the filename is enabled | ||||
|             - Filename date format is with Year Month Day (YMD) | ||||
|             - Date order is Day Month Year (DMY, the default) | ||||
|             - Filename contains date matching the format | ||||
|             - Filename date is an ignored date | ||||
|             - File content includes a date | ||||
|  | ||||
|         THEN: | ||||
|             - Should parse the date from the content not filename | ||||
|         """ | ||||
|         self.assertEqual( | ||||
|             parse_date("/tmp/Scan-2022-04-01.pdf", "The matching date is 24.03.2022"), | ||||
|             datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||
|         ) | ||||
|  | ||||
|     @override_settings( | ||||
|         IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), | ||||
|     ) | ||||
|     def test_ignored_dates(self, *args): | ||||
|     def test_ignored_dates_default_order(self, *args): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Ignore dates have been set | ||||
|             - File content includes ignored dates | ||||
|             - File content includes 1 non-ignored date | ||||
|  | ||||
|         THEN: | ||||
|             - Should parse the non-ignored date from content | ||||
|         """ | ||||
|         text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " "ipsum" | ||||
|         date = parse_date("", text) | ||||
|         self.assertEqual( | ||||
|             date, | ||||
|             parse_date("", text), | ||||
|             datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||
|         ) | ||||
|  | ||||
|     @override_settings( | ||||
|         IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), | ||||
|         DATE_ORDER="YMD", | ||||
|     ) | ||||
|     def test_ignored_dates_order_ymd(self, *args): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Ignore dates have been set | ||||
|             - Date order is Year Month Day (YMD) | ||||
|             - File content includes ignored dates | ||||
|             - File content includes 1 non-ignored date | ||||
|  | ||||
|         THEN: | ||||
|             - Should parse the non-ignored date from content | ||||
|         """ | ||||
|         text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem " "ipsum" | ||||
|  | ||||
|         self.assertEqual( | ||||
|             parse_date("", text), | ||||
|             datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||
|         ) | ||||
|   | ||||
| @@ -1,9 +1,11 @@ | ||||
| import datetime | ||||
| import json | ||||
| import math | ||||
| import multiprocessing | ||||
| import os | ||||
| import re | ||||
| from typing import Final | ||||
| from typing import Set | ||||
| from urllib.parse import urlparse | ||||
|  | ||||
| from concurrent_log_handler.queue import setup_logging_queues | ||||
| @@ -604,15 +606,22 @@ if PAPERLESS_TIKA_ENABLED: | ||||
|     INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig") | ||||
|  | ||||
| # List dates that should be ignored when trying to parse date from document text | ||||
| IGNORE_DATES = set() | ||||
| IGNORE_DATES: Set[datetime.date] = set() | ||||
|  | ||||
| if os.getenv("PAPERLESS_IGNORE_DATES", ""): | ||||
|  | ||||
| def _parse_ignore_dates(env_ignore: str) -> Set[datetime.datetime]: | ||||
|     import dateparser | ||||
|  | ||||
|     for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","): | ||||
|     ignored_dates = set() | ||||
|     for s in env_ignore.split(","): | ||||
|         d = dateparser.parse(s) | ||||
|         if d: | ||||
|             IGNORE_DATES.add(d.date()) | ||||
|             ignored_dates.add(d.date()) | ||||
|     return ignored_dates | ||||
|  | ||||
|  | ||||
| if os.getenv("PAPERLESS_IGNORE_DATES") is not None: | ||||
|     IGNORE_DATES = _parse_ignore_dates(os.getenv("PAPERLESS_IGNORE_DATES")) | ||||
|  | ||||
| ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default") | ||||
| if ENABLE_UPDATE_CHECK != "default": | ||||
|   | ||||
							
								
								
									
										45
									
								
								src/paperless/tests/test_settings.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								src/paperless/tests/test_settings.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,45 @@ | ||||
| import datetime | ||||
| from unittest import TestCase | ||||
|  | ||||
| from paperless.settings import _parse_ignore_dates | ||||
|  | ||||
|  | ||||
| class TestIgnoreDateParsing(TestCase): | ||||
|     """ | ||||
|     Tests the parsing of the PAPERLESS_IGNORE_DATES setting value | ||||
|     """ | ||||
|  | ||||
|     def test_no_ignore_dates_set(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - No ignore dates are set | ||||
|         THEN: | ||||
|             - No ignore dates are parsed | ||||
|         """ | ||||
|         self.assertSetEqual(_parse_ignore_dates(""), set()) | ||||
|  | ||||
|     def test_single_ignore_dates_set(self): | ||||
|         """ | ||||
|         GIVEN: | ||||
|             - Ignore dates are set per certain inputs | ||||
|         THEN: | ||||
|             - All ignore dates are parsed | ||||
|         """ | ||||
|         test_cases = [ | ||||
|             ("1985-05-01", [datetime.date(1985, 5, 1)]), | ||||
|             ( | ||||
|                 "1985-05-01,1991-12-05", | ||||
|                 [datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)], | ||||
|             ), | ||||
|             ("2010-12-13", [datetime.date(2010, 12, 13)]), | ||||
|         ] | ||||
|         for env_str, expected_dates in test_cases: | ||||
|             expected_date_set = set() | ||||
|  | ||||
|             for expected_date in expected_dates: | ||||
|                 expected_date_set.add(expected_date) | ||||
|  | ||||
|             self.assertSetEqual( | ||||
|                 _parse_ignore_dates(env_str), | ||||
|                 expected_date_set, | ||||
|             ) | ||||
		Reference in New Issue
	
	Block a user
	 Trenton Holmes
					Trenton Holmes