diff --git a/docs/configuration.rst b/docs/configuration.rst index 92cba8d72..3a4960f82 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -650,7 +650,6 @@ PAPERLESS_CONSUMER_BARCODE_STRING=PATCHT Defaults to "PATCHT" - PAPERLESS_CONVERT_MEMORY_LIMIT= On smaller systems, or even in the case of Very Large Documents, the consumer may explode, complaining about how it's "unable to extend pixel cache". In @@ -696,6 +695,9 @@ PAPERLESS_FILENAME_DATE_ORDER= The filename will be checked first, and if nothing is found, the document text will be checked as normal. + A date in a filename must have some separators (`.`, `-`, `/`, etc) + for it to be parsed. + Defaults to none, which disables this feature. PAPERLESS_THUMBNAIL_FONT_NAME= @@ -713,10 +715,7 @@ PAPERLESS_IGNORE_DATES= this process. This is useful for special dates (like date of birth) that appear in documents regularly but are very unlikely to be the documents creation date. - You may specify dates in a multitude of formats supported by dateparser (see - https://dateparser.readthedocs.io/en/latest/#popular-formats) but as the dates - need to be comma separated, the options are limited. - Example: "2020-12-02,22.04.1999" + The date is parsed using the order specified in PAPERLESS_DATE_ORDER Defaults to an empty string to not ignore any dates. 
diff --git a/src/documents/consumer.py b/src/documents/consumer.py index a59f0bfd7..061f17ee1 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -3,6 +3,8 @@ import hashlib import os import uuid from subprocess import Popen +from typing import Optional +from typing import Type import magic from asgiref.sync import async_to_sync @@ -23,6 +25,7 @@ from .models import Document from .models import DocumentType from .models import FileInfo from .models import Tag +from .parsers import DocumentParser from .parsers import get_parser_class_for_mime_type from .parsers import parse_date from .parsers import ParseError @@ -186,7 +189,7 @@ class Consumer(LoggingMixin): override_document_type_id=None, override_tag_ids=None, task_id=None, - ): + ) -> Document: """ Return the document object if it was successfully created. """ @@ -220,7 +223,10 @@ class Consumer(LoggingMixin): self.log("debug", f"Detected mime type: {mime_type}") - parser_class = get_parser_class_for_mime_type(mime_type) + # Based on the mime type, get the parser for that type + parser_class: Optional[Type[DocumentParser]] = get_parser_class_for_mime_type( + mime_type, + ) if not parser_class: self._fail(MESSAGE_UNSUPPORTED_TYPE, f"Unsupported mime type {mime_type}") @@ -241,7 +247,10 @@ class Consumer(LoggingMixin): # This doesn't parse the document yet, but gives us a parser. 
- document_parser = parser_class(self.logging_group, progress_callback) + document_parser: DocumentParser = parser_class( + self.logging_group, + progress_callback, + ) self.log("debug", f"Parser: {type(document_parser).__name__}") @@ -270,7 +279,7 @@ class Consumer(LoggingMixin): text = document_parser.get_text() date = document_parser.get_date() - if not date: + if date is None: self._send_progress(90, 100, "WORKING", MESSAGE_PARSE_DATE) date = parse_date(self.filename, text) archive_path = document_parser.get_archive_path() @@ -342,7 +351,7 @@ class Consumer(LoggingMixin): ).hexdigest() # Don't save with the lock active. Saving will cause the file - # renaming logic to aquire the lock as well. + # renaming logic to acquire the lock as well. document.save() # Delete the file only if it was successfully consumed @@ -362,7 +371,8 @@ class Consumer(LoggingMixin): except Exception as e: self._fail( str(e), - f"The following error occured while consuming " f"{self.filename}: {e}", + f"The following error occurred while consuming " + f"{self.filename}: {e}", exc_info=True, ) finally: @@ -376,21 +386,26 @@ class Consumer(LoggingMixin): return document - def _store(self, text, date, mime_type): + def _store(self, text, date, mime_type) -> Document: # If someone gave us the original filename, use it instead of doc. 
file_info = FileInfo.from_filename(self.filename) - stats = os.stat(self.path) - self.log("debug", "Saving record to database") - created = ( - file_info.created - or date - or timezone.make_aware(datetime.datetime.fromtimestamp(stats.st_mtime)) - ) + if file_info.created is not None: + create_date = file_info.created + self.log("debug", f"Creation date from FileInfo: {create_date}") + elif date is not None: + create_date = date + self.log("debug", f"Creation date from parse_date: {create_date}") + else: + stats = os.stat(self.path) + create_date = timezone.make_aware( + datetime.datetime.fromtimestamp(stats.st_mtime), + ) + self.log("debug", f"Creation date from st_mtime: {create_date}") storage_type = Document.STORAGE_TYPE_UNENCRYPTED @@ -400,8 +415,8 @@ class Consumer(LoggingMixin): content=text, mime_type=mime_type, checksum=hashlib.md5(f.read()).hexdigest(), - created=created, - modified=created, + created=create_date, + modified=create_date, storage_type=storage_type, ) diff --git a/src/documents/models.py b/src/documents/models.py index fcb4be382..ecabe085c 100644 --- a/src/documents/models.py +++ b/src/documents/models.py @@ -379,6 +379,10 @@ class SavedViewFilterRule(models.Model): # TODO: why is this in the models file? +# TODO: how about, what is this and where is it documented? 
+# It appears to parse JSON from an environment variable to get a title and date from +# the filename, if possible, as a higher priority than either document filename or +# content parsing class FileInfo: REGEXES = OrderedDict( @@ -386,8 +390,7 @@ class FileInfo: ( "created-title", re.compile( - r"^(?P<created>\d\d\d\d\d\d\d\d(\d\d\d\d\d\d)?Z) - " - r"(?P<title>.*)$", + r"^(?P<created>\d{8}(\d{6})?Z) - " r"(?P<title>.*)$", flags=re.IGNORECASE, ), ), @@ -427,7 +430,7 @@ class FileInfo: properties[name] = getattr(cls, f"_get_{name}")(properties[name]) @classmethod - def from_filename(cls, filename): + def from_filename(cls, filename) -> "FileInfo": # Mutate filename in-place before parsing its components # by applying at most one of the configured transformations. for (pattern, repl) in settings.FILENAME_PARSE_TRANSFORMS: diff --git a/src/documents/parsers.py b/src/documents/parsers.py index be4db1e71..469ec2f1e 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -1,3 +1,4 @@ +import datetime import logging import mimetypes import os @@ -5,6 +6,8 @@ import re import shutil import subprocess import tempfile +from typing import Optional +from typing import Set import magic from django.conf import settings @@ -40,11 +43,11 @@ DATE_REGEX = re.compile( logger = logging.getLogger("paperless.parsing") -def is_mime_type_supported(mime_type): +def is_mime_type_supported(mime_type) -> bool: return get_parser_class_for_mime_type(mime_type) is not None -def get_default_file_extension(mime_type): +def get_default_file_extension(mime_type) -> str: for response in document_consumer_declaration.send(None): parser_declaration = response[1] supported_mime_types = parser_declaration["mime_types"] @@ -59,14 +62,14 @@ def get_default_file_extension(mime_type): return "" -def is_file_ext_supported(ext): +def is_file_ext_supported(ext) -> bool: if ext: return ext.lower() in get_supported_file_extensions() else: return False -def get_supported_file_extensions(): +def
get_supported_file_extensions() -> Set[str]: extensions = set() for response in document_consumer_declaration.send(None): parser_declaration = response[1] @@ -121,7 +124,7 @@ def run_convert( auto_orient=False, extra=None, logging_group=None, -): +) -> None: environment = os.environ.copy() if settings.CONVERT_MEMORY_LIMIT: @@ -146,11 +149,11 @@ def run_convert( raise ParseError(f"Convert failed at {args}") -def get_default_thumbnail(): +def get_default_thumbnail() -> str: return os.path.join(os.path.dirname(__file__), "resources", "document.png") -def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None): +def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None) -> str: out_path = os.path.join(temp_dir, "convert_gs.png") # if convert fails, fall back to extracting @@ -184,7 +187,7 @@ def make_thumbnail_from_pdf_gs_fallback(in_path, temp_dir, logging_group=None): return get_default_thumbnail() -def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None): +def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None) -> str: """ The thumbnail of a PDF is just a 500px wide image of the first page. """ @@ -209,12 +212,12 @@ def make_thumbnail_from_pdf(in_path, temp_dir, logging_group=None): return out_path -def parse_date(filename, text): +def parse_date(filename, text) -> Optional[datetime.datetime]: """ Returns the date of the document. 
""" - def __parser(ds, date_order): + def __parser(ds: str, date_order: str) -> datetime.datetime: """ Call dateparser.parse with a particular date ordering """ @@ -230,9 +233,9 @@ def parse_date(filename, text): }, ) - def __filter(date): + def __filter(date: datetime.datetime) -> Optional[datetime.datetime]: if ( - date + date is not None and date.year > 1900 and date <= timezone.now() and date.date() not in settings.IGNORE_DATES @@ -269,7 +272,7 @@ def parse_date(filename, text): date = __filter(date) if date is not None: - break + return date return date @@ -294,7 +297,7 @@ class DocumentParser(LoggingMixin): self.archive_path = None self.text = None - self.date = None + self.date: Optional[datetime.datetime] = None self.progress_callback = progress_callback def progress(self, current_progress, max_progress): @@ -342,7 +345,7 @@ class DocumentParser(LoggingMixin): def get_text(self): return self.text - def get_date(self): + def get_date(self) -> Optional[datetime.datetime]: return self.date def cleanup(self): diff --git a/src/documents/tests/samples/documents/originals/0000005.pdf b/src/documents/tests/samples/documents/originals/0000005.pdf new file mode 100755 index 000000000..cc78528f5 Binary files /dev/null and b/src/documents/tests/samples/documents/originals/0000005.pdf differ diff --git a/src/documents/tests/samples/documents/originals/0000006.pdf b/src/documents/tests/samples/documents/originals/0000006.pdf new file mode 100755 index 000000000..c66896b4e Binary files /dev/null and b/src/documents/tests/samples/documents/originals/0000006.pdf differ diff --git a/src/documents/tests/test_consumer.py b/src/documents/tests/test_consumer.py index 5592d74d7..1ff83705b 100644 --- a/src/documents/tests/test_consumer.py +++ b/src/documents/tests/test_consumer.py @@ -1,3 +1,4 @@ +import datetime import os import re import shutil @@ -5,6 +6,8 @@ import tempfile from unittest import mock from unittest.mock import MagicMock +from dateutil import tz + try: import 
zoneinfo except ImportError: @@ -502,7 +505,7 @@ class TestConsumer(DirectoriesMixin, TestCase): self.assertRaisesMessage( ConsumerError, - "sample.pdf: The following error occured while consuming sample.pdf: NO.", + "sample.pdf: The following error occurred while consuming sample.pdf: NO.", self.consumer.try_consume_file, filename, ) @@ -654,6 +657,127 @@ class TestConsumer(DirectoriesMixin, TestCase): sanity_check() +@mock.patch("documents.consumer.magic.from_file", fake_magic_from_file) +class TestConsumerCreatedDate(DirectoriesMixin, TestCase): + def setUp(self): + super(TestConsumerCreatedDate, self).setUp() + + # this prevents websocket message reports during testing. + patcher = mock.patch("documents.consumer.Consumer._send_progress") + self._send_progress = patcher.start() + self.addCleanup(patcher.stop) + + self.consumer = Consumer() + + def test_consume_date_from_content(self): + """ + GIVEN: + - File content with date in DMY (default) format + + THEN: + - Should parse the date from the file content + """ + src = os.path.join( + os.path.dirname(__file__), + "samples", + "documents", + "originals", + "0000005.pdf", + ) + dst = os.path.join(self.dirs.scratch_dir, "sample.pdf") + shutil.copy(src, dst) + + document = self.consumer.try_consume_file(dst) + + self.assertEqual( + document.created, + datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + + @override_settings(FILENAME_DATE_ORDER="YMD") + def test_consume_date_from_filename(self): + """ + GIVEN: + - File content with date in DMY (default) format + - Filename with date in YMD format + + THEN: + - Should parse the date from the filename + """ + src = os.path.join( + os.path.dirname(__file__), + "samples", + "documents", + "originals", + "0000005.pdf", + ) + dst = os.path.join(self.dirs.scratch_dir, "Scan - 2022-02-01.pdf") + shutil.copy(src, dst) + + document = self.consumer.try_consume_file(dst) + + self.assertEqual( + document.created, + datetime.datetime(2022, 2, 1, 
tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + + def test_consume_date_filename_date_use_content(self): + """ + GIVEN: + - File content with date in DMY (default) format + - Filename date parsing disabled + - Filename with date in YMD format + + THEN: + - Should parse the date from the content + """ + src = os.path.join( + os.path.dirname(__file__), + "samples", + "documents", + "originals", + "0000005.pdf", + ) + dst = os.path.join(self.dirs.scratch_dir, "Scan - 2022-02-01.pdf") + shutil.copy(src, dst) + + document = self.consumer.try_consume_file(dst) + + self.assertEqual( + document.created, + datetime.datetime(1996, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + + @override_settings( + IGNORE_DATES=(datetime.date(2010, 12, 13), datetime.date(2011, 11, 12)), + ) + def test_consume_date_use_content_with_ignore(self): + """ + GIVEN: + - File content with dates in DMY (default) format + - File content includes ignored dates + + THEN: + - Should parse the date from the content + """ + src = os.path.join( + os.path.dirname(__file__), + "samples", + "documents", + "originals", + "0000006.pdf", + ) + dst = os.path.join(self.dirs.scratch_dir, "0000006.pdf") + shutil.copy(src, dst) + + document = self.consumer.try_consume_file(dst) + + self.assertEqual( + document.created, + datetime.datetime(1997, 2, 20, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + + class PreConsumeTestCase(TestCase): @mock.patch("documents.consumer.Popen") @override_settings(PRE_CONSUME_SCRIPT=None) diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py index ae9589ad7..1019c572f 100644 --- a/src/documents/tests/test_date_parsing.py +++ b/src/documents/tests/test_date_parsing.py @@ -8,6 +8,7 @@ from django.conf import settings from django.test import override_settings from django.test import TestCase from documents.parsers import parse_date +from paperless.settings import DATE_ORDER class TestDate(TestCase): @@ -160,19 +161,112 @@ class TestDate(TestCase):
def test_crazy_date_with_spaces(self, *args): self.assertIsNone(parse_date("", "20 408000l 2475")) + @override_settings(FILENAME_DATE_ORDER="YMD") + def test_filename_date_parse_valid_ymd(self, *args): + """ + GIVEN: + - Date parsing from the filename is enabled + - Filename date format is with Year Month Day (YMD) + - Filename contains date matching the format + + THEN: + - Should parse the date from the filename + """ + self.assertEqual( + parse_date("/tmp/Scan-2022-04-01.pdf", "No date in here"), + datetime.datetime(2022, 4, 1, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + + @override_settings(FILENAME_DATE_ORDER="DMY") + def test_filename_date_parse_valid_dmy(self, *args): + """ + GIVEN: + - Date parsing from the filename is enabled + - Filename date format is with Day Month Year (DMY) + - Filename contains date matching the format + + THEN: + - Should parse the date from the filename + """ + self.assertEqual( + parse_date("/tmp/Scan-10.01.2021.pdf", "No date in here"), + datetime.datetime(2021, 1, 10, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + @override_settings(FILENAME_DATE_ORDER="YMD") def test_filename_date_parse_invalid(self, *args): + """ + GIVEN: + - Date parsing from the filename is enabled + - Filename includes no date + - File content includes no date + + THEN: + - No date is parsed + """ self.assertIsNone( parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here"), ) + @override_settings( + FILENAME_DATE_ORDER="YMD", + IGNORE_DATES=(datetime.date(2022, 4, 1),), + ) + def test_filename_date_ignored_use_content(self, *args): + """ + GIVEN: + - Date parsing from the filename is enabled + - Filename date format is with Year Month Day (YMD) + - Date order is Day Month Year (DMY, the default) + - Filename contains date matching the format + - Filename date is an ignored date + - File content includes a date + + THEN: + - Should parse the date from the content not filename + """ + self.assertEqual( + parse_date("/tmp/Scan-2022-04-01.pdf",
"The matching date is 24.03.2022"), + datetime.datetime(2022, 3, 24, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + @override_settings( IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), ) - def test_ignored_dates(self, *args): + def test_ignored_dates_default_order(self, *args): + """ + GIVEN: + - Ignore dates have been set + - File content includes ignored dates + - File content includes 1 non-ignored date + + THEN: + - Should parse the non-ignored date from content + """ text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem " "ipsum" - date = parse_date("", text) self.assertEqual( - date, + parse_date("", text), + datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), + ) + + @override_settings( + IGNORE_DATES=(datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)), + DATE_ORDER="YMD", + ) + def test_ignored_dates_order_ymd(self, *args): + """ + GIVEN: + - Ignore dates have been set + - Date order is Year Month Day (YMD) + - File content includes ignored dates + - File content includes 1 non-ignored date + + THEN: + - Should parse the non-ignored date from content + """ + text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem " "ipsum" + + self.assertEqual( + parse_date("", text), datetime.datetime(2018, 2, 13, 0, 0, tzinfo=tz.gettz(settings.TIME_ZONE)), ) diff --git a/src/paperless/settings.py b/src/paperless/settings.py index 5274c356a..b5be6c420 100644 --- a/src/paperless/settings.py +++ b/src/paperless/settings.py @@ -1,9 +1,11 @@ +import datetime import json import math import multiprocessing import os import re from typing import Final +from typing import Set from urllib.parse import urlparse from concurrent_log_handler.queue import setup_logging_queues @@ -603,16 +605,42 @@ PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv( if PAPERLESS_TIKA_ENABLED: INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig") -# List dates that should be ignored when trying to parse date from
document text -IGNORE_DATES = set() -if os.getenv("PAPERLESS_IGNORE_DATES", ""): +def _parse_ignore_dates( + env_ignore: str, + date_order: str = DATE_ORDER, +) -> Set[datetime.datetime]: + """ + If the PAPERLESS_IGNORE_DATES environment variable is set, parse the + user provided string(s) into dates + + Args: + env_ignore (str): The value of the environment variable, comma separated dates + date_order (str, optional): The format of the date strings. Defaults to DATE_ORDER. + + Returns: + Set[datetime.date]: The set of parsed date objects + """ import dateparser - for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","): - d = dateparser.parse(s) + ignored_dates = set() + for s in env_ignore.split(","): + d = dateparser.parse( + s, + settings={ + "DATE_ORDER": date_order, + }, + ) if d: - IGNORE_DATES.add(d.date()) + ignored_dates.add(d.date()) + return ignored_dates + + +# List dates that should be ignored when trying to parse date from document text +IGNORE_DATES: Set[datetime.date] = set() + +if os.getenv("PAPERLESS_IGNORE_DATES") is not None: + IGNORE_DATES = _parse_ignore_dates(os.getenv("PAPERLESS_IGNORE_DATES")) ENABLE_UPDATE_CHECK = os.getenv("PAPERLESS_ENABLE_UPDATE_CHECK", "default") if ENABLE_UPDATE_CHECK != "default": diff --git a/src/paperless/tests/test_settings.py b/src/paperless/tests/test_settings.py new file mode 100644 index 000000000..57481df5b --- /dev/null +++ b/src/paperless/tests/test_settings.py @@ -0,0 +1,58 @@ +import datetime +from unittest import TestCase + +from paperless.settings import _parse_ignore_dates + + +class TestIgnoreDateParsing(TestCase): + """ + Tests the parsing of the PAPERLESS_IGNORE_DATES setting value + """ + + def _parse_checker(self, test_cases): + """ + Helper function to check ignore date parsing + + Args: + test_cases (list): Tuples of (env_str, date_format, expected_date_set) + """ + for env_str, date_format, expected_date_set in test_cases: + + self.assertSetEqual( + _parse_ignore_dates(env_str, date_format), + expected_date_set, + ) + +
def test_no_ignore_dates_set(self): + """ + GIVEN: + - No ignore dates are set + THEN: + - No ignore dates are parsed + """ + self.assertSetEqual(_parse_ignore_dates(""), set()) + + def test_single_ignore_dates_set(self): + """ + GIVEN: + - Ignore dates are set per certain inputs + THEN: + - All ignore dates are parsed + """ + test_cases = [ + ("1985-05-01", "YMD", {datetime.date(1985, 5, 1)}), + ( + "1985-05-01,1991-12-05", + "YMD", + {datetime.date(1985, 5, 1), datetime.date(1991, 12, 5)}, + ), + ("2010-12-13", "YMD", {datetime.date(2010, 12, 13)}), + ("11.01.10", "DMY", {datetime.date(2010, 1, 11)}), + ( + "11.01.2001,15-06-1996", + "DMY", + {datetime.date(2001, 1, 11), datetime.date(1996, 6, 15)}, + ), + ] + + self._parse_checker(test_cases)