mirror of
				https://github.com/paperless-ngx/paperless-ngx.git
				synced 2025-10-30 03:56:23 -05:00 
			
		
		
		
	Merge pull request #721 from paperless-ngx/bug-fix-date-ignore
Fix Ignore Date Parsing
This commit is contained in:
		| @@ -650,7 +650,6 @@ PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT | |||||||
|  |  | ||||||
|   Defaults to "PATCHT" |   Defaults to "PATCHT" | ||||||
|  |  | ||||||
|  |  | ||||||
| PAPERLESS_CONVERT_MEMORY_LIMIT=<num> | PAPERLESS_CONVERT_MEMORY_LIMIT=<num> | ||||||
|     On smaller systems, or even in the case of Very Large Documents, the consumer |     On smaller systems, or even in the case of Very Large Documents, the consumer | ||||||
|     may explode, complaining about how it's "unable to extend pixel cache".  In |     may explode, complaining about how it's "unable to extend pixel cache".  In | ||||||
| @@ -696,6 +695,9 @@ PAPERLESS_FILENAME_DATE_ORDER=<format> | |||||||
|     The filename will be checked first, and if nothing is found, the document |     The filename will be checked first, and if nothing is found, the document | ||||||
|     text will be checked as normal. |     text will be checked as normal. | ||||||
|  |  | ||||||
|  |     A date in a filename must have some separators (`.`, `-`, `/`, etc) | ||||||
|  |     for it to be parsed. | ||||||
|  |  | ||||||
|     Defaults to none, which disables this feature. |     Defaults to none, which disables this feature. | ||||||
|  |  | ||||||
| PAPERLESS_THUMBNAIL_FONT_NAME=<filename> | PAPERLESS_THUMBNAIL_FONT_NAME=<filename> | ||||||
| @@ -713,10 +715,7 @@ PAPERLESS_IGNORE_DATES=<string> | |||||||
|     this process. This is useful for special dates (like date of birth) that appear |     this process. This is useful for special dates (like date of birth) that appear | ||||||
|     in documents regularly but are very unlikely to be the document's creation date. |     in documents regularly but are very unlikely to be the document's creation date. | ||||||
|  |  | ||||||
|     You may specify dates in a multitude of formats supported by dateparser (see |     The date is parsed using the order specified in PAPERLESS_DATE_ORDER | ||||||
|     https://dateparser.readthedocs.io/en/latest/#popular-formats) but as the dates |  | ||||||
|     need to be comma separated, the options are limited. |  | ||||||
|     Example: "2020-12-02,22.04.1999" |  | ||||||
|  |  | ||||||
|     Defaults to an empty string to not ignore any dates. |     Defaults to an empty string to not ignore any dates. | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,6 +3,8 @@ import hashlib | |||||||
| import os | import os | ||||||
| import uuid | import uuid | ||||||
| from subprocess import Popen | from subprocess import Popen | ||||||
|  | from typing import Optional | ||||||
|  | from typing import Type | ||||||
|  |  | ||||||
| import magic | import magic | ||||||
| from asgiref.sync import async_to_sync | from asgiref.sync import async_to_sync | ||||||
| @@ -23,6 +25,7 @@ from .models import Document | |||||||
| from .models import DocumentType | from .models import DocumentType | ||||||
| from .models import FileInfo | from .models import FileInfo | ||||||
| from .models import Tag | from .models import Tag | ||||||
|  | from .parsers import DocumentParser | ||||||
| from .parsers import get_parser_class_for_mime_type | from .parsers import get_parser_class_for_mime_type | ||||||
| from .parsers import parse_date | from .parsers import parse_date | ||||||
| from .parsers import ParseError | from .parsers import ParseError | ||||||
| @@ -186,7 +189,7 @@ class Consumer(LoggingMixin): | |||||||
|         override_document_type_id=None, |         override_document_type_id=None, | ||||||
|         override_tag_ids=None, |         override_tag_ids=None, | ||||||
|         task_id=None, |         task_id=None, | ||||||
|     ): |     ) -> Document: | ||||||
|         """ |         """ | ||||||
|         Return the document object if it was successfully created. |         Return the document object if it was successfully created. | ||||||
|         """ |         """ | ||||||
| @@ -220,7 +223,10 @@ class Consumer(LoggingMixin): | |||||||
|  |  | ||||||
|         self.log("debug", f"Detected mime type: {mime_type}") |         self.log("debug", f"Detected mime type: {mime_type}") | ||||||
|  |  | ||||||
|         parser_class = get_parser_class_for_mime_type(mime_type) |         # Based on the mime type, get the parser for that type | ||||||
|  |         parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type( | ||||||
|  |             mime_type, | ||||||
|  |         ) | ||||||
|         if not parser_class: |         if not parser_class: | ||||||
|             self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}") |             self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}") | ||||||
|  |  | ||||||
| @@ -241,7 +247,10 @@ class Consumer(LoggingMixin): | |||||||
|  |  | ||||||
|         # This doesn't parse the document yet, but gives us a parser. |         # This doesn't parse the document yet, but gives us a parser. | ||||||
|  |  | ||||||
|         document_parser = parser_class(self.logging_group, progress_callback) |         document_parser: DocumentParser = parser_class( | ||||||
|  |             self.logging_group, | ||||||
|  |             progress_callback, | ||||||
|  |         ) | ||||||
|  |  | ||||||
|         self.log("debug", f"Parser: {type(document_parser).__name__}") |         self.log("debug", f"Parser: {type(document_parser).__name__}") | ||||||
|  |  | ||||||
| @@ -270,7 +279,7 @@ class Consumer(LoggingMixin): | |||||||
|  |  | ||||||
|             text = document_parser.get_text() |             text = document_parser.get_text() | ||||||
|             date = document_parser.get_date() |             date = document_parser.get_date() | ||||||
|             if not date: |             if date is None: | ||||||
|                 self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE) |                 self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE) | ||||||
|                 date = parse_date(self.filename, text) |                 date = parse_date(self.filename, text) | ||||||
|             archive_path = document_parser.get_archive_path() |             archive_path = document_parser.get_archive_path() | ||||||
| @@ -342,7 +351,7 @@ class Consumer(LoggingMixin): | |||||||
|                             ).hexdigest() |                             ).hexdigest() | ||||||
|  |  | ||||||
|                 # Don't save with the lock active. Saving will cause the file |                 # Don't save with the lock active. Saving will cause the file | ||||||
|                 # renaming logic to aquire the lock as well. |                 # renaming logic to acquire the lock as well. | ||||||
|                 document.save() |                 document.save() | ||||||
|  |  | ||||||
|                 # Delete the file only if it was successfully consumed |                 # Delete the file only if it was successfully consumed | ||||||
| @@ -362,7 +371,8 @@ class Consumer(LoggingMixin): | |||||||
|         except Exception as e: |         except Exception as e: | ||||||
|             self._fail( |             self._fail( | ||||||
|                 str(e), |                 str(e), | ||||||
|                 f"The following error occured while consuming " f"{self.filename}: {e}", |                 f"The following error occurred while consuming " | ||||||
|  |                 f"{self.filename}: {e}", | ||||||
|                 exc_info=True, |                 exc_info=True, | ||||||
|             ) |             ) | ||||||
|         finally: |         finally: | ||||||
| @@ -376,21 +386,26 @@ class Consumer(LoggingMixin): | |||||||
|  |  | ||||||
|         return document |         return document | ||||||
|  |  | ||||||
|     def _store(self, text, date, mime_type): |     def _store(self, text, date, mime_type) -> Document: | ||||||
|  |  | ||||||
|         # If someone gave us the original filename, use it instead of doc. |         # If someone gave us the original filename, use it instead of doc. | ||||||
|  |  | ||||||
|         file_info = FileInfo.from_filename(self.filename) |         file_info = FileInfo.from_filename(self.filename) | ||||||
|  |  | ||||||
|         stats = os.stat(self.path) |  | ||||||
|  |  | ||||||
|         self.log("debug", "Saving record to database") |         self.log("debug", "Saving record to database") | ||||||
|  |  | ||||||
|         created = ( |         if file_info.created is not None: | ||||||
|             file_info.created |             create_date = file_info.created | ||||||
|             or date |             self.log("debug", f"Creation date from FileInfo: {create_date}") | ||||||
|             or timezone.make_aware(datetime.datetime.fromtimestamp(stats.st_mtime)) |         elif date is not None: | ||||||
|  |             create_date = date | ||||||
|  |             self.log("debug", f"Creation date from parse_date: {create_date}") | ||||||
|  |         else: | ||||||
|  |             stats = os.stat(self.path) | ||||||
|  |             create_date = timezone.make_aware( | ||||||
|  |                 datetime.datetime.fromtimestamp(stats.st_mtime), | ||||||
|             ) |             ) | ||||||
|  |             self.log("debug", f"Creation date from st_mtime: {create_date}") | ||||||
|  |  | ||||||
|         storage_type = Document.STORAGE_TYPE_UNENCRYPTED |         storage_type = Document.STORAGE_TYPE_UNENCRYPTED | ||||||
|  |  | ||||||
| @@ -400,8 +415,8 @@ class Consumer(LoggingMixin): | |||||||
|                 content=text, |                 content=text, | ||||||
|                 mime_type=mime_type, |                 mime_type=mime_type, | ||||||
|                 checksum=hashlib.md5(f.read()).hexdigest(), |                 checksum=hashlib.md5(f.read()).hexdigest(), | ||||||
|                 created=created, |                 created=create_date, | ||||||
|                 modified=created, |                 modified=create_date, | ||||||
|                 storage_type=storage_type, |                 storage_type=storage_type, | ||||||
|             ) |             ) | ||||||
|  |  | ||||||
|   | |||||||
| @@ -379,6 +379,10 @@ class SavedViewFilterRule(models.Model): | |||||||
|  |  | ||||||
|  |  | ||||||
| # TODO: why is this in the models file? | # TODO: why is this in the models file? | ||||||
|  | # TODO: how about, what is this and where is it documented? | ||||||
|  | # It appears to be parsing JSON from an environment variable to get a title and date from | ||||||
|  | # the filename, if possible, as a higher priority than either document filename or | ||||||
|  | # content parsing | ||||||
| class FileInfo: | class FileInfo: | ||||||
|  |  | ||||||
|     REGEXES = OrderedDict( |     REGEXES = OrderedDict( | ||||||
| @@ -386,8 +390,7 @@ class FileInfo: | |||||||
|             ( |             ( | ||||||
|                 "created-title", |                 "created-title", | ||||||
|                 re.compile( |                 re.compile( | ||||||
|                     r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " |                     r"^(?P<created>\d{8}(\d{6})?Z) - " r"(?P<title>.*)$", | ||||||
|                     r"(?P<title>.*)$", |  | ||||||
|                     flags=re.IGNORECASE, |                     flags=re.IGNORECASE, | ||||||
|                 ), |                 ), | ||||||
|             ), |             ), | ||||||
| @@ -427,7 +430,7 @@ class FileInfo: | |||||||
|             properties[name] = getattr(cls, f"_get_{name}")(properties[name]) |             properties[name] = getattr(cls, f"_get_{name}")(properties[name]) | ||||||
|  |  | ||||||
|     @classmethod |     @classmethod | ||||||
|     def from_filename(cls, filename): |     def from_filename(cls, filename) -> "FileInfo": | ||||||
|         # Mutate filename in-place before parsing its components |         # Mutate filename in-place before parsing its components | ||||||
|         # by applying at most one of the configured transformations. |         # by applying at most one of the configured transformations. | ||||||
|         for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS: |         for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS: | ||||||
|   | |||||||
| @@ -1,3 +1,4 @@ | |||||||
|  | import datetime | ||||||
| import logging | import logging | ||||||
| import mimetypes | import mimetypes | ||||||
| import os | import os | ||||||
| @@ -5,6 +6,8 @@ import re | |||||||
| import shutil | import shutil | ||||||
| import subprocess | import subprocess | ||||||
| import tempfile | import tempfile | ||||||
|  | from typing import Optional | ||||||
|  | from typing import Set | ||||||
|  |  | ||||||
| import magic | import magic | ||||||
| from django.conf import settings | from django.conf import settings | ||||||
| @@ -40,11 +43,11 @@ DATE_REGEX = re.compile( | |||||||
| logger = logging.getLogger("paperless.parsing") | logger = logging.getLogger("paperless.parsing") | ||||||
|  |  | ||||||
|  |  | ||||||
| def is_mime_type_supported(mime_type): | def is_mime_type_supported(mime_type) -> bool: | ||||||
|     return get_parser_class_for_mime_type(mime_type) is not None |     return get_parser_class_for_mime_type(mime_type) is not None | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_default_file_extension(mime_type): | def get_default_file_extension(mime_type) -> str: | ||||||
|     for response in document_consumer_declaration.send(None): |     for response in document_consumer_declaration.send(None): | ||||||
|         parser_declaration = response[1] |         parser_declaration = response[1] | ||||||
|         supported_mime_types = parser_declaration["mime_types"] |         supported_mime_types = parser_declaration["mime_types"] | ||||||
| @@ -59,14 +62,14 @@ def get_default_file_extension(mime_type): | |||||||
|         return "" |         return "" | ||||||
|  |  | ||||||
|  |  | ||||||
| def is_file_ext_supported(ext): | def is_file_ext_supported(ext) -> bool: | ||||||
|     if ext: |     if ext: | ||||||
|         return ext.lower() in get_supported_file_extensions() |         return ext.lower() in get_supported_file_extensions() | ||||||
|     else: |     else: | ||||||
|         return False |         return False | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_supported_file_extensions(): | def get_supported_file_extensions() -> Set[str]: | ||||||
|     extensions = set() |     extensions = set() | ||||||
|     for response in document_consumer_declaration.send(None): |     for response in document_consumer_declaration.send(None): | ||||||
|         parser_declaration = response[1] |         parser_declaration = response[1] | ||||||
| @@ -121,7 +124,7 @@ def run_convert( | |||||||
|     auto_orient=False, |     auto_orient=False, | ||||||
|     extra=None, |     extra=None, | ||||||
|     logging_group=None, |     logging_group=None, | ||||||
| ): | ) -> None: | ||||||
|  |  | ||||||
|     environment = os.environ.copy() |     environment = os.environ.copy() | ||||||
|     if settings.CONVERT_MEMORY_LIMIT: |     if settings.CONVERT_MEMORY_LIMIT: | ||||||
| @@ -146,11 +149,11 @@ def run_convert( | |||||||
|         raise ParseError(f"Convert failed at {args}") |         raise ParseError(f"Convert failed at {args}") | ||||||
|  |  | ||||||
|  |  | ||||||
| def get_default_thumbnail(): | def get_default_thumbnail() -> str: | ||||||
|     return os.path.join(os.path.dirname(__file__), "resources", "document.png") |     return os.path.join(os.path.dirname(__file__), "resources", "document.png") | ||||||
|  |  | ||||||
|  |  | ||||||
| def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None): | def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str: | ||||||
|     out_path = os.path.join(temp_dir, "convert_gs.png") |     out_path = os.path.join(temp_dir, "convert_gs.png") | ||||||
|  |  | ||||||
|     # if convert fails, fall back to extracting |     # if convert fails, fall back to extracting | ||||||
| @@ -184,7 +187,7 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None): | |||||||
|         return get_default_thumbnail() |         return get_default_thumbnail() | ||||||
|  |  | ||||||
|  |  | ||||||
| def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None): | def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: | ||||||
|     """ |     """ | ||||||
|     The thumbnail of a PDF is just a 500px wide image of the first page. |     The thumbnail of a PDF is just a 500px wide image of the first page. | ||||||
|     """ |     """ | ||||||
| @@ -209,12 +212,12 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None): | |||||||
|     return out_path |     return out_path | ||||||
|  |  | ||||||
|  |  | ||||||
| def parse_date(filename, text): | def parse_date(filename, text) -> Optional[datetime.datetime]: | ||||||
|     """ |     """ | ||||||
|     Returns the date of the document. |     Returns the date of the document. | ||||||
|     """ |     """ | ||||||
|  |  | ||||||
|     def __parser(ds, date_order): |     def __parser(ds: str, date_order: str) -> datetime.datetime: | ||||||
|         """ |         """ | ||||||
|         Call dateparser.parse with a particular date ordering |         Call dateparser.parse with a particular date ordering | ||||||
|         """ |         """ | ||||||
| @@ -230,9 +233,9 @@ def parse_date(filename, text): | |||||||
|             }, |             }, | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|     def __filter(date): |     def __filter(date: datetime.datetime) -> Optional[datetime.datetime]: | ||||||
|         if ( |         if ( | ||||||
|             date |             date is not None | ||||||
|             and date.year > 1900 |             and date.year > 1900 | ||||||
|             and date <= timezone.now() |             and date <= timezone.now() | ||||||
|             and date.date() not in settings.IGNORE_DATES |             and date.date() not in settings.IGNORE_DATES | ||||||
| @@ -269,7 +272,7 @@ def parse_date(filename, text): | |||||||
|  |  | ||||||
|         date = __filter(date) |         date = __filter(date) | ||||||
|         if date is not None: |         if date is not None: | ||||||
|             break |             return date | ||||||
|  |  | ||||||
|     return date |     return date | ||||||
|  |  | ||||||
| @@ -294,7 +297,7 @@ class DocumentParser(LoggingMixin): | |||||||
|  |  | ||||||
|         self.archive_path = None |         self.archive_path = None | ||||||
|         self.text = None |         self.text = None | ||||||
|         self.date = None |         self.date: Optional[datetime.datetime] = None | ||||||
|         self.progress_callback = progress_callback |         self.progress_callback = progress_callback | ||||||
|  |  | ||||||
|     def progress(self, current_progress, max_progress): |     def progress(self, current_progress, max_progress): | ||||||
| @@ -342,7 +345,7 @@ class DocumentParser(LoggingMixin): | |||||||
|     def get_text(self): |     def get_text(self): | ||||||
|         return self.text |         return self.text | ||||||
|  |  | ||||||
|     def get_date(self): |     def get_date(self) -> Optional[datetime.datetime]: | ||||||
|         return self.date |         return self.date | ||||||
|  |  | ||||||
|     def cleanup(self): |     def cleanup(self): | ||||||
|   | |||||||
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/documents/originals/0000005.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/documents/originals/0000005.pdf
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										
											BIN
										
									
								
								src/documents/tests/samples/documents/originals/0000006.pdf
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										
											BIN
										
									
								
								src/documents/tests/samples/documents/originals/0000006.pdf
									
									
									
									
									
										Executable file
									
								
							
										
											Binary file not shown.
										
									
								
							| @@ -1,3 +1,4 @@ | |||||||
|  | import datetime | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| import shutil | import shutil | ||||||
| @@ -5,6 +6,8 @@ import tempfile | |||||||
| from unittest import mock | from unittest import mock | ||||||
| from unittest.mock import MagicMock | from unittest.mock import MagicMock | ||||||
|  |  | ||||||
|  | from dateutil import tz | ||||||
|  |  | ||||||
| try: | try: | ||||||
|     import zoneinfo |     import zoneinfo | ||||||
| except ImportError: | except ImportError: | ||||||
| @@ -502,7 +505,7 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|  |  | ||||||
|         self.assertRaisesMessage( |         self.assertRaisesMessage( | ||||||
|             ConsumerError, |             ConsumerError, | ||||||
|             "sample.pdf: The following error occured while consuming sample.pdf: NO.", |             "sample.pdf: The following error occurred while consuming sample.pdf: NO.", | ||||||
|             self.consumer.try_consume_file, |             self.consumer.try_consume_file, | ||||||
|             filename, |             filename, | ||||||
|         ) |         ) | ||||||
| @@ -654,6 +657,127 @@ class TestConsumer(DirectoriesMixin, TestCase): | |||||||
|         sanity_check() |         sanity_check() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | @mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) | ||||||
|  | class TestConsumerCreatedDate(DirectoriesMixin, TestCase): | ||||||
|  |     def setUp(self): | ||||||
|  |         super(TestConsumerCreatedDate, self).setUp() | ||||||
|  |  | ||||||
|  |         # this prevents websocket message reports during testing. | ||||||
|  |         patcher = mock.patch("documents.consumer.Consumer._send_progress") | ||||||
|  |         self._send_progress = patcher.start() | ||||||
|  |         self.addCleanup(patcher.stop) | ||||||
|  |  | ||||||
|  |         self.consumer = Consumer() | ||||||
|  |  | ||||||
|  |     def test_consume_date_from_content(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File content with date in DMY (default) format | ||||||
|  |  | ||||||
|  |         THEN: | ||||||
|  |             - Should parse the date from the file content | ||||||
|  |         """ | ||||||
|  |         src = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "documents", | ||||||
|  |             "originals", | ||||||
|  |             "0000005.pdf", | ||||||
|  |         ) | ||||||
|  |         dst = os.path.join(self.dirs.scratch_dir, "sample.pdf") | ||||||
|  |         shutil.copy(src, dst) | ||||||
|  |  | ||||||
|  |         document = self.consumer.try_consume_file(dst) | ||||||
|  |  | ||||||
|  |         self.assertEqual( | ||||||
|  |             document.created, | ||||||
|  |             datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||||
|  |     def test_consume_date_from_filename(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File content with date in DMY (default) format | ||||||
|  |             - Filename with date in YMD format | ||||||
|  |  | ||||||
|  |         THEN: | ||||||
|  |             - Should parse the date from the filename | ||||||
|  |         """ | ||||||
|  |         src = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "documents", | ||||||
|  |             "originals", | ||||||
|  |             "0000005.pdf", | ||||||
|  |         ) | ||||||
|  |         dst = os.path.join(self.dirs.scratch_dir, "Scan - 2022-02-01.pdf") | ||||||
|  |         shutil.copy(src, dst) | ||||||
|  |  | ||||||
|  |         document = self.consumer.try_consume_file(dst) | ||||||
|  |  | ||||||
|  |         self.assertEqual( | ||||||
|  |             document.created, | ||||||
|  |             datetime.datetime(2022, 2, 1, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     def test_consume_date_filename_date_use_content(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File content with date in DMY (default) format | ||||||
|  |             - Filename date parsing disabled | ||||||
|  |             - Filename with date in YMD format | ||||||
|  |  | ||||||
|  |         THEN: | ||||||
|  |             - Should parse the date from the content | ||||||
|  |         """ | ||||||
|  |         src = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "documents", | ||||||
|  |             "originals", | ||||||
|  |             "0000005.pdf", | ||||||
|  |         ) | ||||||
|  |         dst = os.path.join(self.dirs.scratch_dir, "Scan - 2022-02-01.pdf") | ||||||
|  |         shutil.copy(src, dst) | ||||||
|  |  | ||||||
|  |         document = self.consumer.try_consume_file(dst) | ||||||
|  |  | ||||||
|  |         self.assertEqual( | ||||||
|  |             document.created, | ||||||
|  |             datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @override_settings( | ||||||
|  |         IGNORE_DATES=(datetime.date(2010, 12, 13), datetime.date(2011, 11, 12)), | ||||||
|  |     ) | ||||||
|  |     def test_consume_date_use_content_with_ignore(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - File content with dates in DMY (default) format | ||||||
|  |             - File content includes ignored dates | ||||||
|  |  | ||||||
|  |         THEN: | ||||||
|  |             - Should parse the date from the content | ||||||
|  |         """ | ||||||
|  |         src = os.path.join( | ||||||
|  |             os.path.dirname(__file__), | ||||||
|  |             "samples", | ||||||
|  |             "documents", | ||||||
|  |             "originals", | ||||||
|  |             "0000006.pdf", | ||||||
|  |         ) | ||||||
|  |         dst = os.path.join(self.dirs.scratch_dir, "0000006.pdf") | ||||||
|  |         shutil.copy(src, dst) | ||||||
|  |  | ||||||
|  |         document = self.consumer.try_consume_file(dst) | ||||||
|  |  | ||||||
|  |         self.assertEqual( | ||||||
|  |             document.created, | ||||||
|  |             datetime.datetime(1997, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |  | ||||||
| class PreConsumeTestCase(TestCase): | class PreConsumeTestCase(TestCase): | ||||||
|     @mock.patch("documents.consumer.Popen") |     @mock.patch("documents.consumer.Popen") | ||||||
|     @override_settings(PRE_CONSUME_SCRIPT=None) |     @override_settings(PRE_CONSUME_SCRIPT=None) | ||||||
|   | |||||||
| @@ -8,6 +8,7 @@ from django.conf import settings | |||||||
| from django.test import override_settings | from django.test import override_settings | ||||||
| from django.test import TestCase | from django.test import TestCase | ||||||
| from documents.parsers import parse_date | from documents.parsers import parse_date | ||||||
|  | from paperless.settings import DATE_ORDER | ||||||
|  |  | ||||||
|  |  | ||||||
| class TestDate(TestCase): | class TestDate(TestCase): | ||||||
| @@ -160,19 +161,112 @@ class TestDate(TestCase): | |||||||
|     def test_crazy_date_with_spaces(self, *args): |     def test_crazy_date_with_spaces(self, *args): | ||||||
|         self.assertIsNone(parse_date("", "20 408000l 2475")) |         self.assertIsNone(parse_date("", "20 408000l 2475")) | ||||||
|  |  | ||||||
|  |     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||||
|  |     def test_filename_date_parse_valid_ymd(self, *args): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Date parsing from the filename is enabled | ||||||
|  |             - Filename date format is with Year Month Day (YMD) | ||||||
|  |             - Filename contains date matching the format | ||||||
|  |  | ||||||
|  |         THEN: | ||||||
|  |             - Should parse the date from the filename | ||||||
|  |         """ | ||||||
|  |         self.assertEqual( | ||||||
|  |             parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here"), | ||||||
|  |             datetime.datetime(2022, 4, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @override_settings(FILENAME_DATE_ORDER="DMY") | ||||||
|  |     def test_filename_date_parse_valid_dmy(self, *args): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Date parsing from the filename is enabled | ||||||
|  |             - Filename date format is with Day Month Year (DMY) | ||||||
|  |             - Filename contains date matching the format | ||||||
|  |  | ||||||
|  |         THEN: | ||||||
|  |             - Should parse the date from the filename | ||||||
|  |         """ | ||||||
|  |         self.assertEqual( | ||||||
|  |             parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here"), | ||||||
|  |             datetime.datetime(2021, 1, 10, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|     @override_settings(FILENAME_DATE_ORDER="YMD") |     @override_settings(FILENAME_DATE_ORDER="YMD") | ||||||
|     def test_filename_date_parse_invalid(self, *args): |     def test_filename_date_parse_invalid(self, *args): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Date parsing from the filename is enabled | ||||||
|  |             - Filename includes no date | ||||||
|  |             - File content includes no date | ||||||
|  |  | ||||||
|  |         THEN: | ||||||
|  |             - No date is parsed | ||||||
|  |         """ | ||||||
|         self.assertIsNone( |         self.assertIsNone( | ||||||
|             parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"), |             parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"), | ||||||
|         ) |         ) | ||||||
|  |  | ||||||
|  |     @override_settings( | ||||||
|  |         FILENAME_DATE_ORDER="YMD", | ||||||
|  |         IGNORE_DATES=(datetime.date(2022, 4, 1),), | ||||||
|  |     ) | ||||||
|  |     def test_filename_date_ignored_use_content(self, *args): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Date parsing from the filename is enabled | ||||||
|  |             - Filename date format is with Year Month Day (YMD) | ||||||
|  |             - Date order is Day Month Year (DMY, the default) | ||||||
|  |             - Filename contains date matching the format | ||||||
|  |             - Filename date is an ignored date | ||||||
|  |             - File content includes a date | ||||||
|  |  | ||||||
|  |         THEN: | ||||||
|  |             - Should parse the date from the content not filename | ||||||
|  |         """ | ||||||
|  |         self.assertEqual( | ||||||
|  |             parse_date("/tmp/Scan-2022-04-01.pdf", "The matching date is 24.03.2022"), | ||||||
|  |             datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|     @override_settings( |     @override_settings( | ||||||
|         IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), |         IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), | ||||||
|     ) |     ) | ||||||
|     def test_ignored_dates(self, *args): |     def test_ignored_dates_default_order(self, *args): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Ignore dates have been set | ||||||
|  |             - File content includes ignored dates | ||||||
|  |             - File content includes 1 non-ignored date | ||||||
|  |  | ||||||
|  |         THEN: | ||||||
|  |             - Should parse the non-ignored date from content | ||||||
|  |         """ | ||||||
|         text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " "ipsum" |         text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " "ipsum" | ||||||
|         date = parse_date("", text) |  | ||||||
|         self.assertEqual( |         self.assertEqual( | ||||||
|             date, |             parse_date("", text), | ||||||
|  |             datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||||
|  |         ) | ||||||
|  |  | ||||||
|  |     @override_settings( | ||||||
|  |         IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), | ||||||
|  |         DATE_ORDER="YMD", | ||||||
|  |     ) | ||||||
|  |     def test_ignored_dates_order_ymd(self, *args): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Ignore dates have been set | ||||||
|  |             - Date order is Year Month Day (YMD) | ||||||
|  |             - File content includes ignored dates | ||||||
|  |             - File content includes 1 non-ignored date | ||||||
|  |  | ||||||
|  |         THEN: | ||||||
|  |             - Should parse the non-ignored date from content | ||||||
|  |         """ | ||||||
|  |         text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem " "ipsum" | ||||||
|  |  | ||||||
|  |         self.assertEqual( | ||||||
|  |             parse_date("", text), | ||||||
|             datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), |             datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), | ||||||
|         ) |         ) | ||||||
|   | |||||||
| @@ -1,9 +1,11 @@ | |||||||
|  | import datetime | ||||||
| import json | import json | ||||||
| import math | import math | ||||||
| import multiprocessing | import multiprocessing | ||||||
| import os | import os | ||||||
| import re | import re | ||||||
| from typing import Final | from typing import Final | ||||||
|  | from typing import Set | ||||||
| from urllib.parse import urlparse | from urllib.parse import urlparse | ||||||
|  |  | ||||||
| from concurrent_log_handler.queue import setup_logging_queues | from concurrent_log_handler.queue import setup_logging_queues | ||||||
| @@ -603,16 +605,42 @@ PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( | |||||||
| if PAPERLESS_TIKA_ENABLED: | if PAPERLESS_TIKA_ENABLED: | ||||||
|     INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig") |     INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig") | ||||||
|  |  | ||||||
| # List dates that should be ignored when trying to parse date from document text |  | ||||||
| IGNORE_DATES = set() |  | ||||||
|  |  | ||||||
| if os.getenv("PAPERLESS_IGNORE_DATES", ""): | def _parse_ignore_dates( | ||||||
|  |     env_ignore: str, | ||||||
|  |     date_order: str = DATE_ORDER, | ||||||
|  | ) -> Set[datetime.datetime]: | ||||||
|  |     """ | ||||||
|  |     If the PAPERLESS_IGNORE_DATES environment variable is set, parse the | ||||||
|  |     user provided string(s) into dates | ||||||
|  |  | ||||||
|  |     Args: | ||||||
|  |         env_ignore (str): The value of the environment variable, comma separated dates | ||||||
|  |         date_order (str, optional): The format of the date strings. Defaults to DATE_ORDER. | ||||||
|  |  | ||||||
|  |     Returns: | ||||||
|  |         Set[datetime.date]: The set of parsed date objects | ||||||
|  |     """ | ||||||
|     import dateparser |     import dateparser | ||||||
|  |  | ||||||
|     for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","): |     ignored_dates = set() | ||||||
|         d = dateparser.parse(s) |     for s in env_ignore.split(","): | ||||||
|  |         d = dateparser.parse( | ||||||
|  |             s, | ||||||
|  |             settings={ | ||||||
|  |                 "DATE_ORDER": date_order, | ||||||
|  |             }, | ||||||
|  |         ) | ||||||
|         if d: |         if d: | ||||||
|             IGNORE_DATES.add(d.date()) |             ignored_dates.add(d.date()) | ||||||
|  |     return ignored_dates | ||||||
|  |  | ||||||
|  |  | ||||||
|  | # List dates that should be ignored when trying to parse date from document text | ||||||
|  | IGNORE_DATES: Set[datetime.date] = set() | ||||||
|  |  | ||||||
|  | if os.getenv("PAPERLESS_IGNORE_DATES") is not None: | ||||||
|  |     IGNORE_DATES = _parse_ignore_dates(os.getenv("PAPERLESS_IGNORE_DATES")) | ||||||
|  |  | ||||||
| ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default") | ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default") | ||||||
| if ENABLE_UPDATE_CHECK != "default": | if ENABLE_UPDATE_CHECK != "default": | ||||||
|   | |||||||
							
								
								
									
										58
									
								
								src/paperless/tests/test_settings.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										58
									
								
								src/paperless/tests/test_settings.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,58 @@ | |||||||
|  | import datetime | ||||||
|  | from unittest import TestCase | ||||||
|  |  | ||||||
|  | from paperless.settings import _parse_ignore_dates | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class TestIgnoreDateParsing(TestCase): | ||||||
|  |     """ | ||||||
|  |     Tests the parsing of the PAPERLESS_IGNORE_DATES setting value | ||||||
|  |     """ | ||||||
|  |  | ||||||
|  |     def _parse_checker(self, test_cases): | ||||||
|  |         """ | ||||||
|  |         Helper function to check ignore date parsing | ||||||
|  |  | ||||||
|  |         Args: | ||||||
|  |             test_cases: Iterable of (env_str, date_format, expected_date_set) tuples | ||||||
|  |         """ | ||||||
|  |         for env_str, date_format, expected_date_set in test_cases: | ||||||
|  |  | ||||||
|  |             self.assertSetEqual( | ||||||
|  |                 _parse_ignore_dates(env_str, date_format), | ||||||
|  |                 expected_date_set, | ||||||
|  |             ) | ||||||
|  |  | ||||||
|  |     def test_no_ignore_dates_set(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - No ignore dates are set | ||||||
|  |         THEN: | ||||||
|  |             - No ignore dates are parsed | ||||||
|  |         """ | ||||||
|  |         self.assertSetEqual(_parse_ignore_dates(""), set()) | ||||||
|  |  | ||||||
|  |     def test_single_ignore_dates_set(self): | ||||||
|  |         """ | ||||||
|  |         GIVEN: | ||||||
|  |             - Ignore dates are set per certain inputs | ||||||
|  |         THEN: | ||||||
|  |             - All ignore dates are parsed | ||||||
|  |         """ | ||||||
|  |         test_cases = [ | ||||||
|  |             ("1985-05-01", "YMD", {datetime.date(1985, 5, 1)}), | ||||||
|  |             ( | ||||||
|  |                 "1985-05-01,1991-12-05", | ||||||
|  |                 "YMD", | ||||||
|  |                 {datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)}, | ||||||
|  |             ), | ||||||
|  |             ("2010-12-13", "YMD", {datetime.date(2010, 12, 13)}), | ||||||
|  |             ("11.01.10", "DMY", {datetime.date(2010, 1, 11)}), | ||||||
|  |             ( | ||||||
|  |                 "11.01.2001,15-06-1996", | ||||||
|  |                 "DMY", | ||||||
|  |                 {datetime.date(2001, 1, 11), datetime.date(1996, 6, 15)}, | ||||||
|  |             ), | ||||||
|  |         ] | ||||||
|  |  | ||||||
|  |         self._parse_checker(test_cases) | ||||||
		Reference in New Issue
	
	Block a user
	 shamoon
					shamoon