mirror of
https://github.com/paperless-ngx/paperless-ngx.git
synced 2026-01-28 22:59:03 -06:00
Hooks up the date parser class and fixes up the old tests. Also includes the OCR-to-dateparser language conversion we now do.
This commit is contained in:
@@ -32,12 +32,12 @@ from documents.models import WorkflowTrigger
|
||||
from documents.parsers import DocumentParser
|
||||
from documents.parsers import ParseError
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import parse_date
|
||||
from documents.permissions import set_permissions_for_object
|
||||
from documents.plugins.base import AlwaysRunPluginMixin
|
||||
from documents.plugins.base import ConsumeTaskPlugin
|
||||
from documents.plugins.base import NoCleanupPluginMixin
|
||||
from documents.plugins.base import NoSetupPluginMixin
|
||||
from documents.plugins.date_parsing import get_date_parser
|
||||
from documents.plugins.helpers import ProgressManager
|
||||
from documents.plugins.helpers import ProgressStatusOptions
|
||||
from documents.signals import document_consumption_finished
|
||||
@@ -426,7 +426,8 @@ class ConsumerPlugin(
|
||||
ProgressStatusOptions.WORKING,
|
||||
ConsumerStatusShortMessage.PARSE_DATE,
|
||||
)
|
||||
date = parse_date(self.filename, text)
|
||||
date_parser = get_date_parser()
|
||||
date = next(date_parser.parse(self.filename, text), None)
|
||||
archive_path = document_parser.get_archive_path()
|
||||
page_count = document_parser.get_page_count(self.working_copy, mime_type)
|
||||
|
||||
|
||||
@@ -9,22 +9,17 @@ import subprocess
|
||||
import tempfile
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from re import Match
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
|
||||
from documents.loggers import LoggingMixin
|
||||
from documents.signals import document_consumer_declaration
|
||||
from documents.utils import copy_file_with_basic_stats
|
||||
from documents.utils import run_subprocess
|
||||
from paperless.config import OcrConfig
|
||||
from paperless.utils import ocr_to_dateparser_languages
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import datetime
|
||||
from collections.abc import Iterator
|
||||
|
||||
# This regular expression will try to find dates in the document at
|
||||
# hand and will match the following formats:
|
||||
@@ -259,75 +254,6 @@ def make_thumbnail_from_pdf(in_path: Path, temp_dir: Path, logging_group=None) -
|
||||
return out_path
|
||||
|
||||
|
||||
def parse_date(filename, text) -> datetime.datetime | None:
|
||||
return next(parse_date_generator(filename, text), None)
|
||||
|
||||
|
||||
def parse_date_generator(filename, text) -> Iterator[datetime.datetime]:
|
||||
"""
|
||||
Returns the date of the document.
|
||||
"""
|
||||
|
||||
def __parser(ds: str, date_order: str) -> datetime.datetime:
|
||||
"""
|
||||
Call dateparser.parse with a particular date ordering
|
||||
"""
|
||||
import dateparser
|
||||
|
||||
ocr_config = OcrConfig()
|
||||
languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
|
||||
ocr_config.language,
|
||||
)
|
||||
|
||||
return dateparser.parse(
|
||||
ds,
|
||||
settings={
|
||||
"DATE_ORDER": date_order,
|
||||
"PREFER_DAY_OF_MONTH": "first",
|
||||
"RETURN_AS_TIMEZONE_AWARE": True,
|
||||
"TIMEZONE": settings.TIME_ZONE,
|
||||
},
|
||||
locales=languages,
|
||||
)
|
||||
|
||||
def __filter(date: datetime.datetime) -> datetime.datetime | None:
|
||||
if (
|
||||
date is not None
|
||||
and date.year > 1900
|
||||
and date <= timezone.now()
|
||||
and date.date() not in settings.IGNORE_DATES
|
||||
):
|
||||
return date
|
||||
return None
|
||||
|
||||
def __process_match(
|
||||
match: Match[str],
|
||||
date_order: str,
|
||||
) -> datetime.datetime | None:
|
||||
date_string = match.group(0)
|
||||
|
||||
try:
|
||||
date = __parser(date_string, date_order)
|
||||
except Exception:
|
||||
# Skip all matches that do not parse to a proper date
|
||||
date = None
|
||||
|
||||
return __filter(date)
|
||||
|
||||
def __process_content(content: str, date_order: str) -> Iterator[datetime.datetime]:
|
||||
for m in re.finditer(DATE_REGEX, content):
|
||||
date = __process_match(m, date_order)
|
||||
if date is not None:
|
||||
yield date
|
||||
|
||||
# if filename date parsing is enabled, search there first:
|
||||
if settings.FILENAME_DATE_ORDER:
|
||||
yield from __process_content(filename, settings.FILENAME_DATE_ORDER)
|
||||
|
||||
# Iterate through all regex matches in text and try to parse the date
|
||||
yield from __process_content(text, settings.DATE_ORDER)
|
||||
|
||||
|
||||
class ParseError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ from django.utils import timezone
|
||||
from documents.plugins.date_parsing.base import DateParserConfig
|
||||
from documents.plugins.date_parsing.base import DateParserPluginBase
|
||||
from documents.plugins.date_parsing.regex_parser import RegexDateParserPlugin
|
||||
from paperless.utils import ocr_to_dateparser_languages
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -73,7 +74,10 @@ def get_date_parser() -> DateParserPluginBase:
|
||||
|
||||
# 2. Load configuration from settings
|
||||
# TODO: Get the language from the settings and/or configuration object, depending
|
||||
languages = settings.DATE_PARSER_LANGUAGES
|
||||
# Use the explicitly configured date parser languages when set; otherwise
# fall back to converting the OCR language setting.
languages = settings.DATE_PARSER_LANGUAGES or ocr_to_dateparser_languages(
    settings.OCR_LANGUAGE,
)
|
||||
|
||||
config = DateParserConfig(
|
||||
languages=languages,
|
||||
|
||||
@@ -1978,11 +1978,11 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
response = self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
|
||||
@mock.patch("documents.parsers.parse_date_generator")
|
||||
@mock.patch("documents.views.get_date_parser")
|
||||
@override_settings(NUMBER_OF_SUGGESTED_DATES=0)
|
||||
def test_get_suggestions_dates_disabled(
|
||||
self,
|
||||
parse_date_generator,
|
||||
mock_get_date_parser: mock.MagicMock,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
@@ -1999,7 +1999,8 @@ class TestDocumentApi(DirectoriesMixin, DocumentConsumeDelayMixin, APITestCase):
|
||||
)
|
||||
|
||||
self.client.get(f"/api/documents/{doc.pk}/suggestions/")
|
||||
self.assertFalse(parse_date_generator.called)
|
||||
|
||||
mock_get_date_parser.assert_not_called()
|
||||
|
||||
def test_saved_views(self):
|
||||
u1 = User.objects.create_superuser("user1")
|
||||
|
||||
@@ -1,538 +0,0 @@
|
||||
import datetime
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
import pytest
|
||||
from pytest_django.fixtures import SettingsWrapper
|
||||
|
||||
from documents.parsers import parse_date
|
||||
from documents.parsers import parse_date_generator
|
||||
|
||||
|
||||
@pytest.mark.django_db()
|
||||
class TestDate:
|
||||
def test_date_format_1(self):
|
||||
text = "lorem ipsum 130218 lorem ipsum"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_2(self):
|
||||
text = "lorem ipsum 2018 lorem ipsum"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_3(self):
|
||||
text = "lorem ipsum 20180213 lorem ipsum"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_4(self, settings_timezone: ZoneInfo):
|
||||
text = "lorem ipsum 13.02.2018 lorem ipsum"
|
||||
date = parse_date("", text)
|
||||
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
|
||||
|
||||
def test_date_format_5(self, settings_timezone: ZoneInfo):
|
||||
text = "lorem ipsum 130218, 2018, 20180213 and lorem 13.02.2018 lorem ipsum"
|
||||
date = parse_date("", text)
|
||||
assert date == datetime.datetime(2018, 2, 13, 0, 0, tzinfo=settings_timezone)
|
||||
|
||||
def test_date_format_6(self):
|
||||
text = (
|
||||
"lorem ipsum\n"
|
||||
"Wohnort\n"
|
||||
"3100\n"
|
||||
"IBAN\n"
|
||||
"AT87 4534\n"
|
||||
"1234\n"
|
||||
"1234 5678\n"
|
||||
"BIC\n"
|
||||
"lorem ipsum"
|
||||
)
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_7(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
):
|
||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
||||
text = "lorem ipsum\nMärz 2019\nlorem ipsum"
|
||||
date = parse_date("", text)
|
||||
assert date == datetime.datetime(2019, 3, 1, 0, 0, tzinfo=settings_timezone)
|
||||
|
||||
def test_date_format_8(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
):
|
||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
||||
text = (
|
||||
"lorem ipsum\n"
|
||||
"Wohnort\n"
|
||||
"3100\n"
|
||||
"IBAN\n"
|
||||
"AT87 4534\n"
|
||||
"1234\n"
|
||||
"1234 5678\n"
|
||||
"BIC\n"
|
||||
"lorem ipsum\n"
|
||||
"März 2020"
|
||||
)
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2020,
|
||||
3,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_9(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
):
|
||||
settings.DATE_PARSER_LANGUAGES = ["de"]
|
||||
text = "lorem ipsum\n27. Nullmonth 2020\nMärz 2020\nlorem ipsum"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2020,
|
||||
3,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_10(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 22-MAR-2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_11(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 22 MAR 2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_12(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 22/MAR/2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_13(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 22.MAR.2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_14(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 22.MAR 2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_15(self):
|
||||
text = "Customer Number Currency 22.MAR.22 Credit Card 1934829304"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_16(self):
|
||||
text = "Customer Number Currency 22.MAR,22 Credit Card 1934829304"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_17(self):
|
||||
text = "Customer Number Currency 22,MAR,2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_18(self):
|
||||
text = "Customer Number Currency 22 MAR,2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) is None
|
||||
|
||||
def test_date_format_19(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 21st MAR 2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
21,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_20(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 22nd March 2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_21(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 2nd MAR 2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
2,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_22(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 23rd MAR 2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
23,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_23(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 24th MAR 2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
24,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_24(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 21-MAR-2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
21,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_25(self, settings_timezone: ZoneInfo):
|
||||
text = "Customer Number Currency 25TH MAR 2022 Credit Card 1934829304"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2022,
|
||||
3,
|
||||
25,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_date_format_26(self, settings_timezone: ZoneInfo):
|
||||
text = "CHASE 0 September 25, 2019 JPMorgan Chase Bank, NA. P0 Box 182051"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2019,
|
||||
9,
|
||||
25,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_crazy_date_past(self):
|
||||
assert parse_date("", "01-07-0590 00:00:00") is None
|
||||
|
||||
def test_crazy_date_future(self):
|
||||
assert parse_date("", "01-07-2350 00:00:00") is None
|
||||
|
||||
def test_crazy_date_with_spaces(self):
|
||||
assert parse_date("", "20 408000l 2475") is None
|
||||
|
||||
def test_utf_month_names(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
):
|
||||
settings.DATE_PARSER_LANGUAGES = ["fr", "de", "hr", "cs", "pl", "tr"]
|
||||
assert parse_date("", "13 décembre 2023") == datetime.datetime(
|
||||
2023,
|
||||
12,
|
||||
13,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "13 août 2022") == datetime.datetime(
|
||||
2022,
|
||||
8,
|
||||
13,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "11 März 2020") == datetime.datetime(
|
||||
2020,
|
||||
3,
|
||||
11,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "17. ožujka 2018.") == datetime.datetime(
|
||||
2018,
|
||||
3,
|
||||
17,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "1. veljače 2016.") == datetime.datetime(
|
||||
2016,
|
||||
2,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "15. února 1985") == datetime.datetime(
|
||||
1985,
|
||||
2,
|
||||
15,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "30. září 2011") == datetime.datetime(
|
||||
2011,
|
||||
9,
|
||||
30,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "28. května 1990") == datetime.datetime(
|
||||
1990,
|
||||
5,
|
||||
28,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "1. grudzień 1997") == datetime.datetime(
|
||||
1997,
|
||||
12,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "17 Şubat 2024") == datetime.datetime(
|
||||
2024,
|
||||
2,
|
||||
17,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "30 Ağustos 2012") == datetime.datetime(
|
||||
2012,
|
||||
8,
|
||||
30,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "17 Eylül 2000") == datetime.datetime(
|
||||
2000,
|
||||
9,
|
||||
17,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
assert parse_date("", "5. október 1992") == datetime.datetime(
|
||||
1992,
|
||||
10,
|
||||
5,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_multiple_dates(self, settings_timezone: ZoneInfo):
|
||||
text = """This text has multiple dates.
|
||||
For example 02.02.2018, 22 July 2022 and December 2021.
|
||||
But not 24-12-9999 because it's in the future..."""
|
||||
dates = list(parse_date_generator("", text))
|
||||
|
||||
assert dates == [
|
||||
datetime.datetime(2018, 2, 2, 0, 0, tzinfo=settings_timezone),
|
||||
datetime.datetime(
|
||||
2022,
|
||||
7,
|
||||
22,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
),
|
||||
datetime.datetime(
|
||||
2021,
|
||||
12,
|
||||
1,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
),
|
||||
]
|
||||
|
||||
def test_filename_date_parse_valid_ymd(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
- Date parsing from the filename is enabled
|
||||
- Filename date format is with Year Month Day (YMD)
|
||||
- Filename contains date matching the format
|
||||
|
||||
THEN:
|
||||
- Should parse the date from the filename
|
||||
"""
|
||||
settings.FILENAME_DATE_ORDER = "YMD"
|
||||
|
||||
assert parse_date(
|
||||
"/tmp/Scan-2022-04-01.pdf",
|
||||
"No date in here",
|
||||
) == datetime.datetime(2022, 4, 1, 0, 0, tzinfo=settings_timezone)
|
||||
|
||||
def test_filename_date_parse_valid_dmy(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
- Date parsing from the filename is enabled
|
||||
- Filename date format is with Day Month Year (DMY)
|
||||
- Filename contains date matching the format
|
||||
|
||||
THEN:
|
||||
- Should parse the date from the filename
|
||||
"""
|
||||
settings.FILENAME_DATE_ORDER = "DMY"
|
||||
assert parse_date(
|
||||
"/tmp/Scan-10.01.2021.pdf",
|
||||
"No date in here",
|
||||
) == datetime.datetime(2021, 1, 10, 0, 0, tzinfo=settings_timezone)
|
||||
|
||||
def test_filename_date_parse_invalid(self, settings: SettingsWrapper):
|
||||
"""
|
||||
GIVEN:
|
||||
- Date parsing from the filename is enabled
|
||||
- Filename includes no date
|
||||
- File content includes no date
|
||||
|
||||
THEN:
|
||||
- No date is parsed
|
||||
"""
|
||||
settings.FILENAME_DATE_ORDER = "YMD"
|
||||
assert parse_date("/tmp/20 408000l 2475 - test.pdf", "No date in here") is None
|
||||
|
||||
def test_filename_date_ignored_use_content(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
- Date parsing from the filename is enabled
|
||||
- Filename date format is with Day Month Year (YMD)
|
||||
- Date order is Day Month Year (DMY, the default)
|
||||
- Filename contains date matching the format
|
||||
- Filename date is an ignored date
|
||||
- File content includes a date
|
||||
|
||||
THEN:
|
||||
- Should parse the date from the content not filename
|
||||
"""
|
||||
settings.FILENAME_DATE_ORDER = "YMD"
|
||||
settings.IGNORE_DATES = (datetime.date(2022, 4, 1),)
|
||||
assert parse_date(
|
||||
"/tmp/Scan-2022-04-01.pdf",
|
||||
"The matching date is 24.03.2022",
|
||||
) == datetime.datetime(2022, 3, 24, 0, 0, tzinfo=settings_timezone)
|
||||
|
||||
def test_ignored_dates_default_order(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
- Ignore dates have been set
|
||||
- File content includes ignored dates
|
||||
- File content includes 1 non-ignored date
|
||||
|
||||
THEN:
|
||||
- Should parse the date non-ignored date from content
|
||||
"""
|
||||
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
|
||||
text = "lorem ipsum 110319, 20200117 and lorem 13.02.2018 lorem ipsum"
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2018,
|
||||
2,
|
||||
13,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
|
||||
def test_ignored_dates_order_ymd(
|
||||
self,
|
||||
settings: SettingsWrapper,
|
||||
settings_timezone: ZoneInfo,
|
||||
):
|
||||
"""
|
||||
GIVEN:
|
||||
- Ignore dates have been set
|
||||
- Date order is Year Month Date (YMD)
|
||||
- File content includes ignored dates
|
||||
- File content includes 1 non-ignored date
|
||||
|
||||
THEN:
|
||||
- Should parse the date non-ignored date from content
|
||||
"""
|
||||
|
||||
settings.FILENAME_DATE_ORDER = "YMD"
|
||||
settings.IGNORE_DATES = (datetime.date(2019, 11, 3), datetime.date(2020, 1, 17))
|
||||
|
||||
text = "lorem ipsum 190311, 20200117 and lorem 13.02.2018 lorem ipsum"
|
||||
|
||||
assert parse_date("", text) == datetime.datetime(
|
||||
2018,
|
||||
2,
|
||||
13,
|
||||
0,
|
||||
0,
|
||||
tzinfo=settings_timezone,
|
||||
)
|
||||
@@ -148,7 +148,6 @@ from documents.models import Workflow
|
||||
from documents.models import WorkflowAction
|
||||
from documents.models import WorkflowTrigger
|
||||
from documents.parsers import get_parser_class_for_mime_type
|
||||
from documents.parsers import parse_date_generator
|
||||
from documents.permissions import AcknowledgeTasksPermissions
|
||||
from documents.permissions import PaperlessAdminPermissions
|
||||
from documents.permissions import PaperlessNotePermissions
|
||||
@@ -158,6 +157,7 @@ from documents.permissions import get_document_count_filter_for_user
|
||||
from documents.permissions import get_objects_for_user_owner_aware
|
||||
from documents.permissions import has_perms_owner_aware
|
||||
from documents.permissions import set_permissions_for_object
|
||||
from documents.plugins.date_parsing import get_date_parser
|
||||
from documents.schema import generate_object_with_permissions_schema
|
||||
from documents.serialisers import AcknowledgeTasksViewSerializer
|
||||
from documents.serialisers import BulkDownloadSerializer
|
||||
@@ -1023,7 +1023,8 @@ class DocumentViewSet(
|
||||
|
||||
dates = []
|
||||
if settings.NUMBER_OF_SUGGESTED_DATES > 0:
|
||||
gen = parse_date_generator(doc.filename, doc.content)
|
||||
date_parser = get_date_parser()
|
||||
gen = date_parser.parse(doc.filename, doc.content)
|
||||
dates = sorted(
|
||||
{
|
||||
i
|
||||
|
||||
Reference in New Issue
Block a user