From 32771391ade911550bfc7197f83b2efbb3ca321a Mon Sep 17 00:00:00 2001 From: Trenton H <797416+stumpylog@users.noreply.github.com> Date: Wed, 28 Jan 2026 14:13:29 -0800 Subject: [PATCH] Hooks up the class and fixes up the old testing. Includes ocr to date parser conversion we now do --- src/documents/consumer.py | 5 +- src/documents/parsers.py | 74 --- .../plugins/date_parsing/__init__.py | 6 +- src/documents/tests/test_api_documents.py | 7 +- src/documents/tests/test_date_parsing.py | 538 ------------------ src/documents/views.py | 5 +- 6 files changed, 15 insertions(+), 620 deletions(-) delete mode 100644 src/documents/tests/test_date_parsing.py diff --git a/src/documents/consumer.py b/src/documents/consumer.py index 1ff60220b..b47962d6e 100644 --- a/src/documents/consumer.py +++ b/src/documents/consumer.py @@ -32,12 +32,12 @@ from documents.models import WorkflowTrigger from documents.parsers import DocumentParser from documents.parsers import ParseError from documents.parsers import get_parser_class_for_mime_type -from documents.parsers import parse_date from documents.permissions import set_permissions_for_object from documents.plugins.base import AlwaysRunPluginMixin from documents.plugins.base import ConsumeTaskPlugin from documents.plugins.base import NoCleanupPluginMixin from documents.plugins.base import NoSetupPluginMixin +from documents.plugins.date_parsing import get_date_parser from documents.plugins.helpers import ProgressManager from documents.plugins.helpers import ProgressStatusOptions from documents.signals import document_consumption_finished @@ -426,7 +426,8 @@ class ConsumerPlugin( ProgressStatusOptions.WORKING, ConsumerStatusShortMessage.PARSE_DATE, ) - date = parse_date(self.filename, text) + date_parser = get_date_parser() + date = next(date_parser.parse(self.filename, text), None) archive_path = document_parser.get_archive_path() page_count = document_parser.get_page_count(self.working_copy, mime_type) diff --git a/src/documents/parsers.py 
b/src/documents/parsers.py index f6417e285..96b43f150 100644 --- a/src/documents/parsers.py +++ b/src/documents/parsers.py @@ -9,22 +9,17 @@ import subprocess import tempfile from functools import lru_cache from pathlib import Path -from re import Match from typing import TYPE_CHECKING from django.conf import settings -from django.utils import timezone from documents.loggers import LoggingMixin from documents.signals import document_consumer_declaration from documents.utils import copy_file_with_basic_stats from documents.utils import run_subprocess -from paperless.config import OcrConfig -from paperless.utils import ocr_to_dateparser_languages if TYPE_CHECKING: import datetime - from collections.abc import Iterator # This regular expression will try to find dates in the document at # hand and will match the following formats: @@ -259,75 +254,6 @@ def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) - return out_path -def parse_date(filename, text) -> datetime.datetime | None: - return next(parse_date_generator(filename, text), None) - - -def parse_date_generator(filename, text) -> Iterator[datetime.datetime]: - """ - Returns the date of the document. 
- """ - - def __parser(ds: str, date_order: str) -> datetime.datetime: - """ - Call dateparser.parse with a particular date ordering - """ - import dateparser - - ocr_config = OcrConfig() - languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages( - ocr_config.language, - ) - - return dateparser.parse( - ds, - settings={ - "DATE_ORDER": date_order, - "PREFER_DAY_OF_MONTH": "first", - "RETURN_AS_TIMEZONE_AWARE": True, - "TIMEZONE": settings.TIME_ZONE, - }, - locales=languages, - ) - - def __filter(date: datetime.datetime) -> datetime.datetime | None: - if ( - date is not None - and date.year > 1900 - and date <= timezone.now() - and date.date() not in settings.IGNORE_DATES - ): - return date - return None - - def __process_match( - match: Match[str], - date_order: str, - ) -> datetime.datetime | None: - date_string = match.group(0) - - try: - date = __parser(date_string, date_order) - except Exception: - # Skip all matches that do not parse to a proper date - date = None - - return __filter(date) - - def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]: - for m in re.finditer(DATE_REGEX, content): - date = __process_match(m, date_order) - if date is not None: - yield date - - # if filename date parsing is enabled, search there first: - if settings.FILENAME_DATE_ORDER: - yield from __process_content(filename, settings.FILENAME_DATE_ORDER) - - # Iterate through all regex matches in text and try to parse the date - yield from __process_content(text, settings.DATE_ORDER) - - class ParseError(Exception): pass diff --git a/src/documents/plugins/date_parsing/__init__.py b/src/documents/plugins/date_parsing/__init__.py index f540ce155..a828c13ca 100644 --- a/src/documents/plugins/date_parsing/__init__.py +++ b/src/documents/plugins/date_parsing/__init__.py @@ -10,6 +10,7 @@ from django.utils import timezone from documents.plugins.date_parsing.base import DateParserConfig from documents.plugins.date_parsing.base import 
DateParserPluginBase from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin +from paperless.utils import ocr_to_dateparser_languages logger = logging.getLogger(__name__) @@ -73,7 +74,10 @@ def get_date_parser() -> DateParserPluginBase: # 2. Load configuration from settings # TODO: Get the language from the settings and/or configuration object, depending - languages = settings.DATE_PARSER_LANGUAGES + languages = ( + settings.DATE_PARSER_LANGUAGES + or ocr_to_dateparser_languages(settings.OCR_LANGUAGE) + ) config = DateParserConfig( languages=languages, diff --git a/src/documents/tests/test_api_documents.py b/src/documents/tests/test_api_documents.py index 96d22dc2c..d7b176544 100644 --- a/src/documents/tests/test_api_documents.py +++ b/src/documents/tests/test_api_documents.py @@ -1978,11 +1978,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): response = self.client.get(f"/api/documents/{doc.pk}/suggestions/") self.assertEqual(response.status_code, status.HTTP_200_OK) - @mock.patch("documents.parsers.parse_date_generator") + @mock.patch("documents.views.get_date_parser") @override_settings(NUMBER_OF_SUGGESTED_DATES=0) def test_get_suggestions_dates_disabled( self, - parse_date_generator, + mock_get_date_parser: mock.MagicMock, ): """ GIVEN: @@ -1999,7 +1999,8 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase): ) self.client.get(f"/api/documents/{doc.pk}/suggestions/") - self.assertFalse(parse_date_generator.called) + + mock_get_date_parser.assert_not_called() def test_saved_views(self): u1 = User.objects.create_superuser("user1") diff --git a/src/documents/tests/test_date_parsing.py b/src/documents/tests/test_date_parsing.py deleted file mode 100644 index f565a9544..000000000 --- a/src/documents/tests/test_date_parsing.py +++ /dev/null @@ -1,538 +0,0 @@ -import datetime -from zoneinfo import ZoneInfo - -import pytest -from pytest_django.fixtures import
SettingsWrapper - -from documents.parsers import parse_date -from documents.parsers import parse_date_generator - - -@pytest.mark.django_db() -class TestDate: - def test_date_format_1(self): - text = "lorem ipsum 130218 lorem ipsum" - assert parse_date("", text) is None - - def test_date_format_2(self): - text = "lorem ipsum 2018 lorem ipsum" - assert parse_date("", text) is None - - def test_date_format_3(self): - text = "lorem ipsum 20180213 lorem ipsum" - assert parse_date("", text) is None - - def test_date_format_4(self, settings_timezone: ZoneInfo): - text = "lorem ipsum 13.02.2018 lorem ipsum" - date = parse_date("", text) - assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone) - - def test_date_format_5(self, settings_timezone: ZoneInfo): - text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum" - date = parse_date("", text) - assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone) - - def test_date_format_6(self): - text = ( - "lorem ipsum\n" - "Wohnort\n" - "3100\n" - "IBAN\n" - "AT87 4534\n" - "1234\n" - "1234 5678\n" - "BIC\n" - "lorem ipsum" - ) - assert parse_date("", text) is None - - def test_date_format_7( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ): - settings.DATE_PARSER_LANGUAGES = ["de"] - text = "lorem ipsum\nMärz 2019\nlorem ipsum" - date = parse_date("", text) - assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone) - - def test_date_format_8( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ): - settings.DATE_PARSER_LANGUAGES = ["de"] - text = ( - "lorem ipsum\n" - "Wohnort\n" - "3100\n" - "IBAN\n" - "AT87 4534\n" - "1234\n" - "1234 5678\n" - "BIC\n" - "lorem ipsum\n" - "März 2020" - ) - assert parse_date("", text) == datetime.datetime( - 2020, - 3, - 1, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_9( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ): - 
settings.DATE_PARSER_LANGUAGES = ["de"] - text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum" - assert parse_date("", text) == datetime.datetime( - 2020, - 3, - 1, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_10(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_11(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_12(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_13(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_14(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_15(self): - text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304" - assert parse_date("", text) is None - - def test_date_format_16(self): - text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304" - assert parse_date("", text) is None - - def test_date_format_17(self): - text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304" - assert parse_date("", text) is None - - def test_date_format_18(self): - text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304" - 
assert parse_date("", text) is None - - def test_date_format_19(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 21, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_20(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 22, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_21(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 2, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_22(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 23, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_23(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 24, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_24(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 21, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_25(self, settings_timezone: ZoneInfo): - text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304" - assert parse_date("", text) == datetime.datetime( - 2022, - 3, - 25, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_date_format_26(self, settings_timezone: ZoneInfo): - text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. 
P0 Box 182051" - assert parse_date("", text) == datetime.datetime( - 2019, - 9, - 25, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_crazy_date_past(self): - assert parse_date("", "01-07-0590 00:00:00") is None - - def test_crazy_date_future(self): - assert parse_date("", "01-07-2350 00:00:00") is None - - def test_crazy_date_with_spaces(self): - assert parse_date("", "20 408000l 2475") is None - - def test_utf_month_names( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ): - settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"] - assert parse_date("", "13 décembre 2023") == datetime.datetime( - 2023, - 12, - 13, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "13 août 2022") == datetime.datetime( - 2022, - 8, - 13, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "11 März 2020") == datetime.datetime( - 2020, - 3, - 11, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "17. ožujka 2018.") == datetime.datetime( - 2018, - 3, - 17, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "1. veljače 2016.") == datetime.datetime( - 2016, - 2, - 1, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "15. února 1985") == datetime.datetime( - 1985, - 2, - 15, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "30. září 2011") == datetime.datetime( - 2011, - 9, - 30, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "28. května 1990") == datetime.datetime( - 1990, - 5, - 28, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "1. 
grudzień 1997") == datetime.datetime( - 1997, - 12, - 1, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "17 Şubat 2024") == datetime.datetime( - 2024, - 2, - 17, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "30 Ağustos 2012") == datetime.datetime( - 2012, - 8, - 30, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "17 Eylül 2000") == datetime.datetime( - 2000, - 9, - 17, - 0, - 0, - tzinfo=settings_timezone, - ) - assert parse_date("", "5. október 1992") == datetime.datetime( - 1992, - 10, - 5, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_multiple_dates(self, settings_timezone: ZoneInfo): - text = """This text has multiple dates. - For example 02.02.2018, 22 July 2022 and December 2021. - But not 24-12-9999 because it's in the future...""" - dates = list(parse_date_generator("", text)) - - assert dates == [ - datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone), - datetime.datetime( - 2022, - 7, - 22, - 0, - 0, - tzinfo=settings_timezone, - ), - datetime.datetime( - 2021, - 12, - 1, - 0, - 0, - tzinfo=settings_timezone, - ), - ] - - def test_filename_date_parse_valid_ymd( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ): - """ - GIVEN: - - Date parsing from the filename is enabled - - Filename date format is with Year Month Day (YMD) - - Filename contains date matching the format - - THEN: - - Should parse the date from the filename - """ - settings.FILENAME_DATE_ORDER = "YMD" - - assert parse_date( - "/tmp/Scan-2022-04-01.pdf", - "No date in here", - ) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone) - - def test_filename_date_parse_valid_dmy( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ): - """ - GIVEN: - - Date parsing from the filename is enabled - - Filename date format is with Day Month Year (DMY) - - Filename contains date matching the format - - THEN: - - Should parse the date from the filename - """ - 
settings.FILENAME_DATE_ORDER = "DMY" - assert parse_date( - "/tmp/Scan-10.01.2021.pdf", - "No date in here", - ) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone) - - def test_filename_date_parse_invalid(self, settings: SettingsWrapper): - """ - GIVEN: - - Date parsing from the filename is enabled - - Filename includes no date - - File content includes no date - - THEN: - - No date is parsed - """ - settings.FILENAME_DATE_ORDER = "YMD" - assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None - - def test_filename_date_ignored_use_content( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ): - """ - GIVEN: - - Date parsing from the filename is enabled - - Filename date format is with Day Month Year (YMD) - - Date order is Day Month Year (DMY, the default) - - Filename contains date matching the format - - Filename date is an ignored date - - File content includes a date - - THEN: - - Should parse the date from the content not filename - """ - settings.FILENAME_DATE_ORDER = "YMD" - settings.IGNORE_DATES = (datetime.date(2022, 4, 1),) - assert parse_date( - "/tmp/Scan-2022-04-01.pdf", - "The matching date is 24.03.2022", - ) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone) - - def test_ignored_dates_default_order( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ): - """ - GIVEN: - - Ignore dates have been set - - File content includes ignored dates - - File content includes 1 non-ignored date - - THEN: - - Should parse the date non-ignored date from content - """ - settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)) - text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum" - assert parse_date("", text) == datetime.datetime( - 2018, - 2, - 13, - 0, - 0, - tzinfo=settings_timezone, - ) - - def test_ignored_dates_order_ymd( - self, - settings: SettingsWrapper, - settings_timezone: ZoneInfo, - ): - """ - GIVEN: - - Ignore dates 
have been set - - Date order is Year Month Date (YMD) - - File content includes ignored dates - - File content includes 1 non-ignored date - - THEN: - - Should parse the date non-ignored date from content - """ - - settings.FILENAME_DATE_ORDER = "YMD" - settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17)) - - text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum" - - assert parse_date("", text) == datetime.datetime( - 2018, - 2, - 13, - 0, - 0, - tzinfo=settings_timezone, - ) diff --git a/src/documents/views.py b/src/documents/views.py index c0f3b5db4..bc785f61d 100644 --- a/src/documents/views.py +++ b/src/documents/views.py @@ -148,7 +148,6 @@ from documents.models import Workflow from documents.models import WorkflowAction from documents.models import WorkflowTrigger from documents.parsers import get_parser_class_for_mime_type -from documents.parsers import parse_date_generator from documents.permissions import AcknowledgeTasksPermissions from documents.permissions import PaperlessAdminPermissions from documents.permissions import PaperlessNotePermissions @@ -158,6 +157,7 @@ from documents.permissions import get_document_count_filter_for_user from documents.permissions import get_objects_for_user_owner_aware from documents.permissions import has_perms_owner_aware from documents.permissions import set_permissions_for_object +from documents.plugins.date_parsing import get_date_parser from documents.schema import generate_object_with_permissions_schema from documents.serialisers import AcknowledgeTasksViewSerializer from documents.serialisers import BulkDownloadSerializer @@ -1023,7 +1023,8 @@ class DocumentViewSet( dates = [] if settings.NUMBER_OF_SUGGESTED_DATES > 0: - gen = parse_date_generator(doc.filename, doc.content) + date_parser = get_date_parser() + gen = date_parser.parse(doc.filename, doc.content) dates = sorted( { i