Merge pull request #721 from paperless-ngx/bug-fix-date-ignore

Fix Ignore Date Parsing
This commit is contained in:
shamoon 2022-05-10 16:45:58 -07:00 committed by GitHub
commit 536576518e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 373 additions and 49 deletions

View File

@ -650,7 +650,6 @@ PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT
Defaults to "PATCHT" Defaults to "PATCHT"
PAPERLESS_CONVERT_MEMORY_LIMIT=<num> PAPERLESS_CONVERT_MEMORY_LIMIT=<num>
On smaller systems, or even in the case of Very Large Documents, the consumer On smaller systems, or even in the case of Very Large Documents, the consumer
may explode, complaining about how it's "unable to extend pixel cache". In may explode, complaining about how it's "unable to extend pixel cache". In
@ -696,6 +695,9 @@ PAPERLESS_FILENAME_DATE_ORDER=<format>
The filename will be checked first, and if nothing is found, the document The filename will be checked first, and if nothing is found, the document
text will be checked as normal. text will be checked as normal.
A date in a filename must have some separators (`.`, `-`, `/`, etc)
for it to be parsed.
Defaults to none, which disables this feature. Defaults to none, which disables this feature.
PAPERLESS_THUMBNAIL_FONT_NAME=<filename> PAPERLESS_THUMBNAIL_FONT_NAME=<filename>
@ -713,10 +715,7 @@ PAPERLESS_IGNORE_DATES=<string>
this process. This is useful for special dates (like date of birth) that appear this process. This is useful for special dates (like date of birth) that appear
in documents regularly but are very unlikely to be the document's creation date. in documents regularly but are very unlikely to be the document's creation date.
You may specify dates in a multitude of formats supported by dateparser (see The date is parsed using the order specified in PAPERLESS_DATE_ORDER
https://dateparser.readthedocs.io/en/latest/#popular-formats) but as the dates
need to be comma separated, the options are limited.
Example: "2020-12-02,22.04.1999"
Defaults to an empty string to not ignore any dates. Defaults to an empty string to not ignore any dates.

View File

@ -3,6 +3,8 @@ import hashlib
import os import os
import uuid import uuid
from subprocess import Popen from subprocess import Popen
from typing import Optional
from typing import Type
import magic import magic
from asgiref.sync import async_to_sync from asgiref.sync import async_to_sync
@ -23,6 +25,7 @@ from .models import Document
from .models import DocumentType from .models import DocumentType
from .models import FileInfo from .models import FileInfo
from .models import Tag from .models import Tag
from .parsers import DocumentParser
from .parsers import get_parser_class_for_mime_type from .parsers import get_parser_class_for_mime_type
from .parsers import parse_date from .parsers import parse_date
from .parsers import ParseError from .parsers import ParseError
@ -186,7 +189,7 @@ class Consumer(LoggingMixin):
override_document_type_id=None, override_document_type_id=None,
override_tag_ids=None, override_tag_ids=None,
task_id=None, task_id=None,
): ) -> Document:
""" """
Return the document object if it was successfully created. Return the document object if it was successfully created.
""" """
@ -220,7 +223,10 @@ class Consumer(LoggingMixin):
self.log("debug", f"Detected mime type: {mime_type}") self.log("debug", f"Detected mime type: {mime_type}")
parser_class = get_parser_class_for_mime_type(mime_type) # Based on the mime type, get the parser for that type
parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type(
mime_type,
)
if not parser_class: if not parser_class:
self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}") self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}")
@ -241,7 +247,10 @@ class Consumer(LoggingMixin):
# This doesn't parse the document yet, but gives us a parser. # This doesn't parse the document yet, but gives us a parser.
document_parser = parser_class(self.logging_group, progress_callback) document_parser: DocumentParser = parser_class(
self.logging_group,
progress_callback,
)
self.log("debug", f"Parser: {type(document_parser).__name__}") self.log("debug", f"Parser: {type(document_parser).__name__}")
@ -270,7 +279,7 @@ class Consumer(LoggingMixin):
text = document_parser.get_text() text = document_parser.get_text()
date = document_parser.get_date() date = document_parser.get_date()
if not date: if date is None:
self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE) self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE)
date = parse_date(self.filename, text) date = parse_date(self.filename, text)
archive_path = document_parser.get_archive_path() archive_path = document_parser.get_archive_path()
@ -342,7 +351,7 @@ class Consumer(LoggingMixin):
).hexdigest() ).hexdigest()
# Don't save with the lock active. Saving will cause the file # Don't save with the lock active. Saving will cause the file
# renaming logic to aquire the lock as well. # renaming logic to acquire the lock as well.
document.save() document.save()
# Delete the file only if it was successfully consumed # Delete the file only if it was successfully consumed
@ -362,7 +371,8 @@ class Consumer(LoggingMixin):
except Exception as e: except Exception as e:
self._fail( self._fail(
str(e), str(e),
f"The following error occured while consuming " f"{self.filename}: {e}", f"The following error occurred while consuming "
f"{self.filename}: {e}",
exc_info=True, exc_info=True,
) )
finally: finally:
@ -376,21 +386,26 @@ class Consumer(LoggingMixin):
return document return document
def _store(self, text, date, mime_type): def _store(self, text, date, mime_type) -> Document:
# If someone gave us the original filename, use it instead of doc. # If someone gave us the original filename, use it instead of doc.
file_info = FileInfo.from_filename(self.filename) file_info = FileInfo.from_filename(self.filename)
stats = os.stat(self.path)
self.log("debug", "Saving record to database") self.log("debug", "Saving record to database")
created = ( if file_info.created is not None:
file_info.created create_date = file_info.created
or date self.log("debug", f"Creation date from FileInfo: {create_date}")
or timezone.make_aware(datetime.datetime.fromtimestamp(stats.st_mtime)) elif date is not None:
) create_date = date
self.log("debug", f"Creation date from parse_date: {create_date}")
else:
stats = os.stat(self.path)
create_date = timezone.make_aware(
datetime.datetime.fromtimestamp(stats.st_mtime),
)
self.log("debug", f"Creation date from st_mtime: {create_date}")
storage_type = Document.STORAGE_TYPE_UNENCRYPTED storage_type = Document.STORAGE_TYPE_UNENCRYPTED
@ -400,8 +415,8 @@ class Consumer(LoggingMixin):
content=text, content=text,
mime_type=mime_type, mime_type=mime_type,
checksum=hashlib.md5(f.read()).hexdigest(), checksum=hashlib.md5(f.read()).hexdigest(),
created=created, created=create_date,
modified=created, modified=create_date,
storage_type=storage_type, storage_type=storage_type,
) )

View File

@ -379,6 +379,10 @@ class SavedViewFilterRule(models.Model):
# TODO: why is this in the models file? # TODO: why is this in the models file?
# TODO: how about, what is this and where is it documented?
# It appears to parse JSON from an environment variable to get a title and date from
# the filename, if possible, as a higher priority than either document filename or
# content parsing
class FileInfo: class FileInfo:
REGEXES = OrderedDict( REGEXES = OrderedDict(
@ -386,8 +390,7 @@ class FileInfo:
( (
"created-title", "created-title",
re.compile( re.compile(
r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " r"^(?P<created>\d{8}(\d{6})?Z) - " r"(?P<title>.*)$",
r"(?P<title>.*)$",
flags=re.IGNORECASE, flags=re.IGNORECASE,
), ),
), ),
@ -427,7 +430,7 @@ class FileInfo:
properties[name] = getattr(cls, f"_get_{name}")(properties[name]) properties[name] = getattr(cls, f"_get_{name}")(properties[name])
@classmethod @classmethod
def from_filename(cls, filename): def from_filename(cls, filename) -> "FileInfo":
# Mutate filename in-place before parsing its components # Mutate filename in-place before parsing its components
# by applying at most one of the configured transformations. # by applying at most one of the configured transformations.
for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS: for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS:

View File

@ -1,3 +1,4 @@
import datetime
import logging import logging
import mimetypes import mimetypes
import os import os
@ -5,6 +6,8 @@ import re
import shutil import shutil
import subprocess import subprocess
import tempfile import tempfile
from typing import Optional
from typing import Set
import magic import magic
from django.conf import settings from django.conf import settings
@ -40,11 +43,11 @@ DATE_REGEX = re.compile(
logger = logging.getLogger("paperless.parsing") logger = logging.getLogger("paperless.parsing")
def is_mime_type_supported(mime_type): def is_mime_type_supported(mime_type) -> bool:
return get_parser_class_for_mime_type(mime_type) is not None return get_parser_class_for_mime_type(mime_type) is not None
def get_default_file_extension(mime_type): def get_default_file_extension(mime_type) -> str:
for response in document_consumer_declaration.send(None): for response in document_consumer_declaration.send(None):
parser_declaration = response[1] parser_declaration = response[1]
supported_mime_types = parser_declaration["mime_types"] supported_mime_types = parser_declaration["mime_types"]
@ -59,14 +62,14 @@ def get_default_file_extension(mime_type):
return "" return ""
def is_file_ext_supported(ext): def is_file_ext_supported(ext) -> bool:
if ext: if ext:
return ext.lower() in get_supported_file_extensions() return ext.lower() in get_supported_file_extensions()
else: else:
return False return False
def get_supported_file_extensions(): def get_supported_file_extensions() -> Set[str]:
extensions = set() extensions = set()
for response in document_consumer_declaration.send(None): for response in document_consumer_declaration.send(None):
parser_declaration = response[1] parser_declaration = response[1]
@ -121,7 +124,7 @@ def run_convert(
auto_orient=False, auto_orient=False,
extra=None, extra=None,
logging_group=None, logging_group=None,
): ) -> None:
environment = os.environ.copy() environment = os.environ.copy()
if settings.CONVERT_MEMORY_LIMIT: if settings.CONVERT_MEMORY_LIMIT:
@ -146,11 +149,11 @@ def run_convert(
raise ParseError(f"Convert failed at {args}") raise ParseError(f"Convert failed at {args}")
def get_default_thumbnail(): def get_default_thumbnail() -> str:
return os.path.join(os.path.dirname(__file__), "resources", "document.png") return os.path.join(os.path.dirname(__file__), "resources", "document.png")
def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None): def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str:
out_path = os.path.join(temp_dir, "convert_gs.png") out_path = os.path.join(temp_dir, "convert_gs.png")
# if convert fails, fall back to extracting # if convert fails, fall back to extracting
@ -184,7 +187,7 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None):
return get_default_thumbnail() return get_default_thumbnail()
def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None): def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str:
""" """
The thumbnail of a PDF is just a 500px wide image of the first page. The thumbnail of a PDF is just a 500px wide image of the first page.
""" """
@ -209,12 +212,12 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None):
return out_path return out_path
def parse_date(filename, text): def parse_date(filename, text) -> Optional[datetime.datetime]:
""" """
Returns the date of the document. Returns the date of the document.
""" """
def __parser(ds, date_order): def __parser(ds: str, date_order: str) -> datetime.datetime:
""" """
Call dateparser.parse with a particular date ordering Call dateparser.parse with a particular date ordering
""" """
@ -230,9 +233,9 @@ def parse_date(filename, text):
}, },
) )
def __filter(date): def __filter(date: datetime.datetime) -> Optional[datetime.datetime]:
if ( if (
date date is not None
and date.year > 1900 and date.year > 1900
and date <= timezone.now() and date <= timezone.now()
and date.date() not in settings.IGNORE_DATES and date.date() not in settings.IGNORE_DATES
@ -269,7 +272,7 @@ def parse_date(filename, text):
date = __filter(date) date = __filter(date)
if date is not None: if date is not None:
break return date
return date return date
@ -294,7 +297,7 @@ class DocumentParser(LoggingMixin):
self.archive_path = None self.archive_path = None
self.text = None self.text = None
self.date = None self.date: Optional[datetime.datetime] = None
self.progress_callback = progress_callback self.progress_callback = progress_callback
def progress(self, current_progress, max_progress): def progress(self, current_progress, max_progress):
@ -342,7 +345,7 @@ class DocumentParser(LoggingMixin):
def get_text(self): def get_text(self):
return self.text return self.text
def get_date(self): def get_date(self) -> Optional[datetime.datetime]:
return self.date return self.date
def cleanup(self): def cleanup(self):

Binary file not shown.

Binary file not shown.

View File

@ -1,3 +1,4 @@
import datetime
import os import os
import re import re
import shutil import shutil
@ -5,6 +6,8 @@ import tempfile
from unittest import mock from unittest import mock
from unittest.mock import MagicMock from unittest.mock import MagicMock
from dateutil import tz
try: try:
import zoneinfo import zoneinfo
except ImportError: except ImportError:
@ -502,7 +505,7 @@ class TestConsumer(DirectoriesMixin, TestCase):
self.assertRaisesMessage( self.assertRaisesMessage(
ConsumerError, ConsumerError,
"sample.pdf: The following error occured while consuming sample.pdf: NO.", "sample.pdf: The following error occurred while consuming sample.pdf: NO.",
self.consumer.try_consume_file, self.consumer.try_consume_file,
filename, filename,
) )
@ -654,6 +657,127 @@ class TestConsumer(DirectoriesMixin, TestCase):
sanity_check() sanity_check()
@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file)
class TestConsumerCreatedDate(DirectoriesMixin, TestCase):
def setUp(self):
super(TestConsumerCreatedDate, self).setUp()
# this prevents websocket message reports during testing.
patcher = mock.patch("documents.consumer.Consumer._send_progress")
self._send_progress = patcher.start()
self.addCleanup(patcher.stop)
self.consumer = Consumer()
def test_consume_date_from_content(self):
"""
GIVEN:
- File content with date in DMY (default) format
THEN:
- Should parse the date from the file content
"""
src = os.path.join(
os.path.dirname(__file__),
"samples",
"documents",
"originals",
"0000005.pdf",
)
dst = os.path.join(self.dirs.scratch_dir, "sample.pdf")
shutil.copy(src, dst)
document = self.consumer.try_consume_file(dst)
self.assertEqual(
document.created,
datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(FILENAME_DATE_ORDER="YMD")
def test_consume_date_from_filename(self):
"""
GIVEN:
- File content with date in DMY (default) format
- Filename with date in YMD format
THEN:
- Should parse the date from the filename
"""
src = os.path.join(
os.path.dirname(__file__),
"samples",
"documents",
"originals",
"0000005.pdf",
)
dst = os.path.join(self.dirs.scratch_dir, "Scan - 2022-02-01.pdf")
shutil.copy(src, dst)
document = self.consumer.try_consume_file(dst)
self.assertEqual(
document.created,
datetime.datetime(2022, 2, 1, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
def test_consume_date_filename_date_use_content(self):
"""
GIVEN:
- File content with date in DMY (default) format
- Filename date parsing disabled
- Filename with date in YMD format
THEN:
- Should parse the date from the content
"""
src = os.path.join(
os.path.dirname(__file__),
"samples",
"documents",
"originals",
"0000005.pdf",
)
dst = os.path.join(self.dirs.scratch_dir, "Scan - 2022-02-01.pdf")
shutil.copy(src, dst)
document = self.consumer.try_consume_file(dst)
self.assertEqual(
document.created,
datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(
IGNORE_DATES=(datetime.date(2010, 12, 13), datetime.date(2011, 11, 12)),
)
def test_consume_date_use_content_with_ignore(self):
"""
GIVEN:
- File content with dates in DMY (default) format
- File content includes ignored dates
THEN:
- Should parse the date from the content, skipping the ignored dates
"""
src = os.path.join(
os.path.dirname(__file__),
"samples",
"documents",
"originals",
"0000006.pdf",
)
dst = os.path.join(self.dirs.scratch_dir, "0000006.pdf")
shutil.copy(src, dst)
document = self.consumer.try_consume_file(dst)
self.assertEqual(
document.created,
datetime.datetime(1997, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
class PreConsumeTestCase(TestCase): class PreConsumeTestCase(TestCase):
@mock.patch("documents.consumer.Popen") @mock.patch("documents.consumer.Popen")
@override_settings(PRE_CONSUME_SCRIPT=None) @override_settings(PRE_CONSUME_SCRIPT=None)

View File

@ -8,6 +8,7 @@ from django.conf import settings
from django.test import override_settings from django.test import override_settings
from django.test import TestCase from django.test import TestCase
from documents.parsers import parse_date from documents.parsers import parse_date
from paperless.settings import DATE_ORDER
class TestDate(TestCase): class TestDate(TestCase):
@ -160,19 +161,112 @@ class TestDate(TestCase):
def test_crazy_date_with_spaces(self, *args): def test_crazy_date_with_spaces(self, *args):
self.assertIsNone(parse_date("", "20 408000l 2475")) self.assertIsNone(parse_date("", "20 408000l 2475"))
@override_settings(FILENAME_DATE_ORDER="YMD")
def test_filename_date_parse_valid_ymd(self, *args):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename date format is with Year Month Day (YMD)
- Filename contains date matching the format
THEN:
- Should parse the date from the filename
"""
self.assertEqual(
parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here"),
datetime.datetime(2022, 4, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(FILENAME_DATE_ORDER="DMY")
def test_filename_date_parse_valid_dmy(self, *args):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename date format is with Day Month Year (DMY)
- Filename contains date matching the format
THEN:
- Should parse the date from the filename
"""
self.assertEqual(
parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here"),
datetime.datetime(2021, 1, 10, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(FILENAME_DATE_ORDER="YMD") @override_settings(FILENAME_DATE_ORDER="YMD")
def test_filename_date_parse_invalid(self, *args): def test_filename_date_parse_invalid(self, *args):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename includes no date
- File content includes no date
THEN:
- No date is parsed
"""
self.assertIsNone( self.assertIsNone(
parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"), parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"),
) )
@override_settings(
FILENAME_DATE_ORDER="YMD",
IGNORE_DATES=(datetime.date(2022, 4, 1),),
)
def test_filename_date_ignored_use_content(self, *args):
"""
GIVEN:
- Date parsing from the filename is enabled
- Filename date format is with Year Month Day (YMD)
- Date order is Day Month Year (DMY, the default)
- Filename contains date matching the format
- Filename date is an ignored date
- File content includes a date
THEN:
- Should parse the date from the content not filename
"""
self.assertEqual(
parse_date("/tmp/Scan-2022-04-01.pdf", "The matching date is 24.03.2022"),
datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings( @override_settings(
IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
) )
def test_ignored_dates(self, *args): def test_ignored_dates_default_order(self, *args):
"""
GIVEN:
- Ignore dates have been set
- File content includes ignored dates
- File content includes 1 non-ignored date
THEN:
- Should parse the non-ignored date from content
"""
text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " "ipsum" text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " "ipsum"
date = parse_date("", text)
self.assertEqual( self.assertEqual(
date, parse_date("", text),
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
)
@override_settings(
IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)),
DATE_ORDER="YMD",
)
def test_ignored_dates_order_ymd(self, *args):
"""
GIVEN:
- Ignore dates have been set
- Date order is Year Month Day (YMD)
- File content includes ignored dates
- File content includes 1 non-ignored date
THEN:
- Should parse the non-ignored date from content
"""
text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem " "ipsum"
self.assertEqual(
parse_date("", text),
datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)),
) )

View File

@ -1,9 +1,11 @@
import datetime
import json import json
import math import math
import multiprocessing import multiprocessing
import os import os
import re import re
from typing import Final from typing import Final
from typing import Set
from urllib.parse import urlparse from urllib.parse import urlparse
from concurrent_log_handler.queue import setup_logging_queues from concurrent_log_handler.queue import setup_logging_queues
@ -603,16 +605,42 @@ PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
if PAPERLESS_TIKA_ENABLED: if PAPERLESS_TIKA_ENABLED:
INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig") INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")
# List dates that should be ignored when trying to parse date from document text
IGNORE_DATES = set()
if os.getenv("PAPERLESS_IGNORE_DATES", ""): def _parse_ignore_dates(
env_ignore: str,
date_order: str = DATE_ORDER,
) -> Set[datetime.datetime]:
"""
If the PAPERLESS_IGNORE_DATES environment variable is set, parse the
user provided string(s) into dates
Args:
env_ignore (str): The value of the environment variable, comma separated dates
date_order (str, optional): The format of the date strings. Defaults to DATE_ORDER.
Returns:
Set[datetime.date]: The set of parsed date objects
"""
import dateparser import dateparser
for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","): ignored_dates = set()
d = dateparser.parse(s) for s in env_ignore.split(","):
d = dateparser.parse(
s,
settings={
"DATE_ORDER": date_order,
},
)
if d: if d:
IGNORE_DATES.add(d.date()) ignored_dates.add(d.date())
return ignored_dates
# List dates that should be ignored when trying to parse date from document text
IGNORE_DATES: Set[datetime.date] = set()
if os.getenv("PAPERLESS_IGNORE_DATES") is not None:
IGNORE_DATES = _parse_ignore_dates(os.getenv("PAPERLESS_IGNORE_DATES"))
ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default") ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default")
if ENABLE_UPDATE_CHECK != "default": if ENABLE_UPDATE_CHECK != "default":

View File

@ -0,0 +1,58 @@
import datetime
from unittest import TestCase
from paperless.settings import _parse_ignore_dates
class TestIgnoreDateParsing(TestCase):
"""
Tests the parsing of the PAPERLESS_IGNORE_DATES setting value
"""
def _parse_checker(self, test_cases):
"""
Helper function to check ignore date parsing
Args:
test_cases (_type_): _description_
"""
for env_str, date_format, expected_date_set in test_cases:
self.assertSetEqual(
_parse_ignore_dates(env_str, date_format),
expected_date_set,
)
def test_no_ignore_dates_set(self):
"""
GIVEN:
- No ignore dates are set
THEN:
- No ignore dates are parsed
"""
self.assertSetEqual(_parse_ignore_dates(""), set())
def test_single_ignore_dates_set(self):
"""
GIVEN:
- Ignore dates are set per certain inputs
THEN:
- All ignore dates are parsed
"""
test_cases = [
("1985-05-01", "YMD", {datetime.date(1985, 5, 1)}),
(
"1985-05-01,1991-12-05",
"YMD",
{datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)},
),
("2010-12-13", "YMD", {datetime.date(2010, 12, 13)}),
("11.01.10", "DMY", {datetime.date(2010, 1, 11)}),
(
"11.01.2001,15-06-1996",
"DMY",
{datetime.date(2001, 1, 11), datetime.date(1996, 6, 15)},
),
]
self._parse_checker(test_cases)